In [0]:
# Change directory to VSCode workspace root so that relative path loads work correctly. Turn this addition off with the DataScience.changeDirOnImportExport setting
# ms-python.python added
import os
# Best-effort: move one directory up so relative paths resolve from the
# workspace root (see the VSCode note above).
try:
    os.chdir(os.path.join(os.getcwd(), '..'))
    print(os.getcwd())
except OSError:
    # Only filesystem failures are tolerated here; a bare `except` would also
    # swallow KeyboardInterrupt / SystemExit and hide unrelated bugs.
    pass


In [1]:
import os
from io import StringIO

from more_itertools import flatten
import pandas as pd

from larval_gonad.config import read_config
from larval_gonad.plotting.biomarkers import _unique_biomarkers, _multi_biomarkers

# Best-effort: run the notebook from the "docs" sub-directory of the current
# working directory. If that directory cannot be entered (for example because
# the cell is re-run after the switch already happened), stay where we are.
try:
    os.chdir(os.path.join(os.getcwd(), "docs"))
    print(os.getcwd())
except OSError:
    # Narrowed from a bare `except`, which would also swallow
    # KeyboardInterrupt / SystemExit.
    pass

# Let pandas render up to 200 rows before truncating displayed tables.
pd.options.display.max_rows = 200


/var/home/fearjm/Projects/larval_gonad/docs


 # Biomarkers

In [2]:
# Biomarker table read from the seurat3-cluster-wf output directory,
# indexed by its "FBgn" column.
biomarkers = (
    pd.read_feather("../output/seurat3-cluster-wf/biomarkers.feather")
    .set_index("FBgn")
)


In [3]:
# How many rows _unique_biomarkers() returns for each cluster, shown as a
# one-column table.
(
    _unique_biomarkers(biomarkers)
    .groupby("cluster")
    .size()
    .to_frame(name="Unique Biomarkers Per Cluster")
)


Unnamed: 0_level_0,Unique Biomarkers Per Cluster
cluster,Unnamed: 1_level_1
G,624
EPS,235
MPS,33
LPS,888
C1,42
C2,11
C3,149
C4,62
T,116
P,218


In [4]:
# Size of every cluster combination among the rows returned by
# _multi_biomarkers(): each FBgn is labelled with the "|"-joined clusters of
# its rows, then the labels are tallied and listed in label order.
# NOTE(review): the label follows the row order of each gene's clusters, so
# two genes with the same clusters in a different row order would get
# different labels -- confirm _multi_biomarkers() returns a stable ordering.
(
    _multi_biomarkers(biomarkers)
    .groupby("FBgn")
    .apply(lambda gene_rows: "|".join(gene_rows["cluster"]))
    .rename("group")
    .reset_index()
    .groupby("group")
    .size()
    .sort_index()
    .to_frame(name="Multiple Biomarkers Per Cluster")
)


Unnamed: 0_level_0,Multiple Biomarkers Per Cluster
group,Unnamed: 1_level_1
C1|C2,12
C1|C2|C3,86
C1|C2|C3|C4,121
C1|C2|C3|C4|P,16
C1|C2|C3|C4|T,5
C1|C2|C3|C4|T|P,4
C1|C2|C3|P,6
C1|C2|C3|T,1
C1|C2|C3|T|P,3
C1|C2|C4,22


 # Literature Genes

In [5]:
# Literature gene lists keyed by category; the "H" category is discarded.
lit_genes = read_config("../config/literature_genes.yaml")
del lit_genes["H"]

# Every FBgn listed under any remaining category, in config order.
lit_fbgns = [fbgn for fbgn_list in lit_genes.values() for fbgn in fbgn_list]

# Biomarker rows whose FBgn index is one of the literature genes.
lit_biomarkers = biomarkers.loc[biomarkers.index.isin(lit_fbgns)]


In [6]:
# Distinct literature genes that have at least one biomarker row.
num_lit_genes_with_deg = len(lit_biomarkers.index.unique())
print("Number lit genes with DEG: {:,}".format(num_lit_genes_with_deg))


Number lit genes with DEG: 63


In [7]:
# For each literature gene with biomarker rows: its cluster labels, sorted and
# joined with "|" (e.g. "C1|C2").
deg_lit_genes = lit_biomarkers.groupby("FBgn").apply(lambda x: "|".join(x.cluster.sort_values()))

# Literature category -> cluster labels that count as a match for that
# category. Every key of `lit_genes` must be present here.
MAPPER = {
    "SP": ["G"],
    "SP_ES": ["G", "EPS"],
    "ES": ["EPS"],
    "PS": ["EPS", "MPS", "LPS"],
    "spermatids": ["LPS"],
    "LS": ["LPS"],
    "EC": ["C4"],
    "CY": ["C1", "C2", "C3", "C4"],
    "TE": ["T"],
    "PC": ["P"],
}

# Collect the literature genes that are a biomarker of at least one cluster
# expected for their category.
res = set()
for k, v in lit_genes.items():
    expected_clusters = set(MAPPER[k])
    for fbgn in v:
        # Compare whole labels, not substrings: a plain `clus in group` test on
        # the joined string makes "P" match "EPS", "MPS" and "LPS".
        # Genes without biomarker rows yield {""}, which never matches.
        observed_clusters = set(deg_lit_genes.get(fbgn, "").split("|"))
        if expected_clusters & observed_clusters:
            res.add(fbgn)

# Guard the percentage against an empty denominator.
pct_coincident = len(res) / num_lit_genes_with_deg * 100 if num_lit_genes_with_deg else 0.0

print(
    f"# Lit Genes with conincident expression: {len(res)} ({pct_coincident:0.2f}%)"
)



# Lit Genes with conincident expression: 50 (79.37%)


 # Protein Traps

In [8]:
# Column order used to sort the protein-trap expression labels.
categories = ["SP", "ES", "MS", "LS", "EC", "MC", "LC", "PC", "TE"]

# One "|"-joined label per gene: the flagged (== 1) columns of the dummy
# variable table, ordered as in `categories`.
ptraps = (
    pd.read_csv("../data/external/miriam/lit_gene_dummy_vars.tsv", sep="\t")
    .drop(columns=["C", "H", "References"])
    .melt(id_vars="Fbgn", var_name="cluster", value_name="flag")
    .assign(cluster=lambda tidy: pd.Categorical(tidy["cluster"], categories=categories))
    .loc[lambda tidy: tidy["flag"] == 1]
    .groupby("Fbgn")
    .apply(lambda gene_rows: "|".join(gene_rows["cluster"].sort_values()))
    .sort_values()
    .rename("ptrap_expressed")
    .rename_axis("FBgn")
)

print("Number of Protein Traps: {:,}".format(len(ptraps)))


Number of Protein Traps: 73


In [9]:
# "|"-joined biomarker clusters (in row order) for every gene that also has a
# protein trap.
deg_by_fbgn = (
    biomarkers.loc[biomarkers.index.isin(ptraps.index)]
    .groupby("FBgn")
    .apply(lambda gene_rows: "|".join(gene_rows["cluster"]))
    .rename("deg")
)

# Align both label series on FBgn side by side and drop genes that are
# missing a value in either column.
ptrap_deg = (
    pd.concat([deg_by_fbgn, ptraps], axis=1, sort=True)
    .dropna()
    .rename_axis("FBgn")
)



In [10]:
# Copy a csv and flag matching by hand
# print(ptrap_deg.to_csv())

# Hand-curated table: `deg` and `ptrap_expressed` pasted from the CSV dump
# above, `hand_flag` set manually to mark whether the two agree.
hand_flags = """FBgn,deg,ptrap_expressed,hand_flag
FBgn0000014,T|P,EC,False
FBgn0000015,T,EC,False
FBgn0000146,G,SP|ES|MS,True
FBgn0000158,G,SP|ES,True
FBgn0000320,C3,EC|MC,True
FBgn0000395,T|C3,TE,True
FBgn0000404,LPS|G,SP|ES|LS,True
FBgn0000405,MPS|LPS,SP|MS|LS,True
FBgn0000546,P,EC,False
FBgn0000576,P,PC,True
FBgn0000964,C2|C4,EC,True
FBgn0001090,C1|C2|C4|C3,EC,True
FBgn0001257,T|C2|C4|C3,EC,True
FBgn0002673,LPS,ES|MS|LS,True
FBgn0002842,EPS,ES|MS,True
FBgn0002862,MPS|LPS,ES|MS|LS,True
FBgn0003984,C4|P,EC,True
FBgn0004108,C2|C4,EC,True
FBgn0004372,EPS,ES|MS|LS,True
FBgn0004606,T|P,EC,False
FBgn0004647,T|C4|P,TE,True
FBgn0004872,C4,EC|MC,True
FBgn0005672,T|P,SP|ES,False
FBgn0010258,MPS|EPS|G,ES|MS|LS,True
FBgn0010453,C2|C4|C3,EC,True
FBgn0011206,LPS,ES|MS|LS,True
FBgn0011591,C4,EC,True
FBgn0011596,LPS,ES|MS|LS,True
FBgn0014163,C1|T|C2|C4|C3|P,EC,True
FBgn0015399,T|C2|C4|C3|P,EC|MC|LC,True
FBgn0019828,MPS|LPS,ES|MS|LS,True
FBgn0020493,T|C3,SP,False
FBgn0024234,T,EC,False
FBgn0024288,P,PC,True
FBgn0030313,MPS|LPS,ES|MS|LS,True
FBgn0031091,EPS|G,SP|ES,True
FBgn0031623,MPS|EPS|G,ES|MS|LS,True
FBgn0031715,EPS|G,ES|MS|LS,True
FBgn0032451,P,EC,False
FBgn0032473,EPS|G,ES|MS|LS,True
FBgn0034435,MPS|LPS|EPS,ES|MS|LS,True
FBgn0034739,MPS|LPS|EPS,ES|MS|LS,True
FBgn0038089,MPS|LPS,SP|ES|MS,True
FBgn0039044,G,SP|ES,True
FBgn0039071,MPS|LPS,ES|MS|LS,True
FBgn0039124,EPS,ES|MS|LS,True
FBgn0041102,MPS|LPS,LS|TE,True
FBgn0041103,MPS|EPS|G,ES|MS|LS,True
FBgn0050418,T,TE,True
FBgn0051361,MPS|LPS,SP|ES|MS,True
FBgn0083963,C3,EC|MC|LC,True
FBgn0243486,C1|C3,MC|LC,True
FBgn0243512,T|C3|P,EC,True
FBgn0250816,G,SP,True
FBgn0250843,C1|C2|G,ES,True
FBgn0264953,T,TE,True
FBgn0283442,G,SP|ES|MS,True
FBgn0283451,T|P,EC,False
"""

# Tally the True / False hand flags.
cnts = pd.read_csv(StringIO(hand_flags)).hand_flag.value_counts()
total = cnts.sum()
# .get() keeps this working when no row is flagged True (cnts[True] would
# raise KeyError in that case).
pct = cnts.get(True, 0) / total * 100
print(f"We looked at {total:,} protein traps")
# `pct` is a percentage, not a count -- say so in the message.
print(f"{pct:,.2f}% of protein traps had some overlap")


We looked at 58 protein traps
84.48 protein traps had some overlap
