In [0]:
# Change directory to VSCode workspace root so that relative path loads work correctly. Turn this addition off with the DataScience.changeDirOnImportExport setting
# ms-python.python added
import os
# Best-effort move to the parent directory so relative paths resolve from there.
try:
	os.chdir(os.path.join(os.getcwd(), os.pardir))
	print(os.getcwd())
except OSError:
	# Only filesystem failures (missing directory, permissions) are ignored;
	# anything else, e.g. KeyboardInterrupt, is allowed to propagate.
	pass


In [184]:
import os
import yaml
import pandas as pd
import tabulate


In [186]:
# Load the shared settings; the context managers guarantee that the file
# handles are closed instead of being left to the garbage collector.
with open('../../config/common.yaml') as fh:
    config = yaml.safe_load(fh)
CLUSTER_ANNOT = config['cluster_annot']
CLUSTER_ORDER = config['cluster_order']

with open('../../config/literature_genes.yaml') as fh:
    LIT_GENES = yaml.safe_load(fh)


In [187]:
# Feather file with gene annotation; its 'FBgn' and 'gene_symbol' columns are used in this notebook.
GENE_METADATA = '../../references/gene_annotation_dmel_r6-24.feather'
# Feather file with per-cluster biomarkers; its 'p_val_adj', 'FBgn' and 'cluster' columns are used in this notebook.
BIOMARKERS = '../../output/seurat3-cluster-wf/combined_n3_biomarkers.feather'


In [188]:
# Lookup from FBgn (index) to gene symbol (values).
fbgn2symbol = (
    pd.read_feather(GENE_METADATA, columns=['FBgn', 'gene_symbol'])
    .set_index('FBgn')
    .squeeze()
)


In [189]:
# Series of literature cell-type labels indexed by FBgn (one row per
# gene/cell-type pair listed in LIT_GENES).
# Built from flat records in a single DataFrame call, and the column is
# selected by name rather than with .squeeze(): squeeze would collapse a
# one-row frame to a scalar, and pd.concat fails outright on an empty mapping.
lit_markers = pd.DataFrame(
    [
        (fbgn, cell_type)
        for cell_type, fbgns in LIT_GENES.items()
        for fbgn in fbgns
    ],
    columns=['FBgn', 'cell_type'],
).set_index('FBgn')['cell_type']


In [190]:
# Cluster label of every significant biomarker (adjusted p-value <= 0.001),
# indexed by FBgn. Cluster categories are renamed with CLUSTER_ANNOT and then
# put into the CLUSTER_ORDER ordering.
bm = (
    pd.read_feather(BIOMARKERS)
    .query('p_val_adj <= 0.001')
    .set_index('FBgn')
    .assign(
        cluster=lambda df: (
            df['cluster']
            .cat.rename_categories(CLUSTER_ANNOT)
            .cat.reorder_categories(CLUSTER_ORDER)
        )
    )
    .loc[:, 'cluster']
)


In [191]:
def get_diff(cluster, cell_type):
    """Compare a cluster's biomarkers with the literature genes of a cell type.

    Every gene in ``fbgn2symbol`` is flagged as a biomarker of ``cluster``
    (according to ``bm``) and as a literature gene of ``cell_type``
    (according to ``lit_markers``).

    Returns
    -------
    tuple
        ``(tbl, common, diff)``: the tabulate-formatted biomarker x lit_gene
        crosstab, the symbols of genes carrying both flags, and the symbols of
        literature genes that are not biomarkers of the cluster.
    """
    genes = fbgn2symbol.index
    cluster_fbgns = bm[bm == cluster].index
    lit_fbgns = lit_markers[lit_markers == cell_type].index
    flags = pd.DataFrame(
        {
            'lit_gene': genes.isin(lit_fbgns),
            'biomarker': genes.isin(cluster_fbgns),
        },
        index=genes,
    )

    def symbols(mask):
        # Translate the flagged FBgns into gene symbols.
        return fbgn2symbol.reindex(flags[mask].index).values.tolist()

    # results
    counts = pd.crosstab(flags.biomarker, flags.lit_gene)
    tbl = tabulate.tabulate(counts, headers='keys')
    common = symbols(flags.biomarker & flags.lit_gene)
    diff = symbols(flags.lit_gene & ~flags.biomarker)
    return tbl, common, diff


In [192]:
# Cluster SP biomarkers vs. literature 'gonia' genes.
tbl, common, diff = get_diff("SP", "gonia")
print(tbl, common, diff, sep='\n')


biomarker      False    True
-----------  -------  ------
False          16528       9
True            1227       7
['bam', 'Rbp4', 'Phf7', 'p53', 'AGO3', 'Prosalpha6', 'vas']
['esg', 'neur', 'nos', 'bgcn', 'spi', 'Dad', 'fest', 'dpr17', 'tut']


In [193]:
# Cluster EPS biomarkers vs. literature 'spermatocytes' genes.
tbl, common, diff = get_diff("EPS", "spermatocytes")
print(tbl, common, diff, sep='\n')



biomarker      False    True
-----------  -------  ------
False          16626      13
True            1122      10
['aub', 'sa', 'aly', 'can', 'Taf12L', 'tomb', 'kmg', 'CG3927', 'tbrd-1', 'nht']
['CycA', 'CycB', 'Mst87F', 'bol', 'fzo', 'mia', 'dj', 'Reepl1', 'oys', 'topi', 'd-cup', 'bb8', 'ocn']


In [194]:
# Cluster PS1 biomarkers vs. literature 'spermatocytes' genes.
tbl, common, diff = get_diff("PS1", "spermatocytes")
print(tbl, common, diff, sep='\n')


biomarker      False    True
-----------  -------  ------
False          16058      13
True            1690      10
['CycB', 'Mst87F', 'bol', 'fzo', 'dj', 'Reepl1', 'CG3927', 'd-cup', 'bb8', 'ocn']
['aub', 'CycA', 'sa', 'aly', 'can', 'mia', 'Taf12L', 'tomb', 'kmg', 'oys', 'topi', 'tbrd-1', 'nht']


In [195]:
# Cluster PS2 biomarkers vs. literature 'spermatocytes' genes.
tbl, common, diff = get_diff("PS2", "spermatocytes")
print(tbl, common, diff, sep='\n')


biomarker      False    True
-----------  -------  ------
False          16295      12
True            1453      11
['CycB', 'Mst87F', 'bol', 'fzo', 'dj', 'Reepl1', 'Taf12L', 'CG3927', 'd-cup', 'bb8', 'ocn']
['aub', 'CycA', 'sa', 'aly', 'can', 'mia', 'tomb', 'kmg', 'oys', 'topi', 'tbrd-1', 'nht']


In [196]:
# Cluster PS3 biomarkers vs. literature 'spermatocytes' genes.
tbl, common, diff = get_diff("PS3", "spermatocytes")
print(tbl, common, diff, sep='\n')


biomarker      False    True
-----------  -------  ------
False          17370      18
True             378       5
['Mst87F', 'dj', 'Taf12L', 'CG3927', 'ocn']
['aub', 'CycA', 'CycB', 'sa', 'aly', 'bol', 'can', 'fzo', 'mia', 'Reepl1', 'tomb', 'kmg', 'oys', 'topi', 'd-cup', 'bb8', 'tbrd-1', 'nht']


In [197]:
# Cluster ECY biomarkers vs. literature 'cysc' genes.
tbl, common, diff = get_diff("ECY", "cysc")
print(tbl, common, diff, sep='\n')


biomarker      False    True
-----------  -------  ------
False          17083      17
True             662       9
['tj', 'bnb', 'ImpL2', 'vn', 'Nrt', 'piwi', 'Wnt4', 'fax', 'kek1']
['eya', 'EcR', 'robo2', 'sev', 'so', 'usp', 'zfh1', 'fng', 'gbb', 'spict', 'sano', 'Cht5', 'Efa6', 'Nlg3', 'rdo', 'puc', 'br']


In [198]:
# Cluster CY1 biomarkers vs. literature 'cysc' genes.
tbl, common, diff = get_diff("CY1", "cysc")
print(tbl, common, diff, sep='\n')


biomarker      False    True
-----------  -------  ------
False          17495      24
True             250       2
['bnb', 'fax']
['eya', 'EcR', 'tj', 'ImpL2', 'robo2', 'sev', 'so', 'usp', 'vn', 'Nrt', 'zfh1', 'piwi', 'Wnt4', 'fng', 'kek1', 'gbb', 'spict', 'sano', 'Cht5', 'Efa6', 'Nlg3', 'rdo', 'puc', 'br']


In [199]:
# Cluster CY2 biomarkers vs. literature 'cysc' genes.
tbl, common, diff = get_diff("CY2", "cysc")
print(tbl, common, diff, sep='\n')


biomarker      False    True
-----------  -------  ------
False          16929      21
True             816       5
['bnb', 'fax', 'kek1', 'Nlg3', 'rdo']
['eya', 'EcR', 'tj', 'ImpL2', 'robo2', 'sev', 'so', 'usp', 'vn', 'Nrt', 'zfh1', 'piwi', 'Wnt4', 'fng', 'gbb', 'spict', 'sano', 'Cht5', 'Efa6', 'puc', 'br']


In [200]:
# Cluster TE biomarkers vs. literature 'te' genes.
tbl, common, diff = get_diff("TE", "te")
print(tbl, common, diff, sep='\n')


biomarker      False    True
-----------  -------  ------
False          17147       0
True             618       6
['abd-A', 'Abd-B', 'cv-2', 'N', 'nord', 'Piezo']
[]


In [201]:
# Cluster PC biomarkers vs. literature 'pigment' genes.
tbl, common, diff = get_diff("PC", "pigment")
print(tbl, common, diff, sep='\n')



biomarker      False    True
-----------  -------  ------
False          17054       0
True             715       2
['ems', 'Sox100B']
[]


In [202]:
def get_miss(cluster, cell_type):
    """Find biomarkers of a cluster that are literature genes of other cell types.

    Every gene in ``fbgn2symbol`` is flagged as a biomarker of ``cluster``
    (according to ``bm``) and as a literature gene when ``lit_markers`` has an
    entry for it whose label differs from ``cell_type``. Note that a gene with
    entries under both ``cell_type`` and another label is therefore flagged too.

    Returns
    -------
    tuple
        ``(tbl, common)``: the tabulate-formatted biomarker x lit_gene crosstab
        and the symbols of genes carrying both flags.
    """
    genes = fbgn2symbol.index
    cluster_fbgns = bm[bm == cluster].index
    other_lit_fbgns = lit_markers[lit_markers != cell_type].index
    flags = pd.DataFrame(
        {
            'lit_gene': genes.isin(other_lit_fbgns),
            'biomarker': genes.isin(cluster_fbgns),
        },
        index=genes,
    )

    # results
    counts = pd.crosstab(flags.biomarker, flags.lit_gene)
    tbl = tabulate.tabulate(counts, headers='keys')
    both = flags[flags.biomarker & flags.lit_gene].index
    common = fbgn2symbol.reindex(both).values.tolist()
    return tbl, common


In [203]:
# Cluster SP biomarkers that are literature genes of a type other than 'gonia'.
tbl, common = get_miss("SP", "gonia")
print(tbl, common, sep='\n')


biomarker      False    True
-----------  -------  ------
False          16476      61
True            1232       2
['aub', 'Taf12L']


In [204]:
# Cluster EPS biomarkers that are literature genes of a type other than 'spermatocytes'.
tbl, common = get_miss("EPS", "spermatocytes")
print(tbl, common, sep='\n')



biomarker      False    True
-----------  -------  ------
False          16589      50
True            1126       6
['Rbp4', 'Phf7', 'fest', 'AGO3', 'Prosalpha6', 'vas']


In [205]:
# Cluster PS1 biomarkers that are literature genes of a type other than 'spermatocytes'.
tbl, common = get_miss("PS1", "spermatocytes")
print(tbl, common, sep='\n')


biomarker      False    True
-----------  -------  ------
False          16017      54
True            1698       2
['fest', 'dpr17']


In [206]:
# Cluster PS2 biomarkers that are literature genes of a type other than 'spermatocytes'.
tbl, common = get_miss("PS2", "spermatocytes")
print(tbl, common, sep='\n')


biomarker      False    True
-----------  -------  ------
False          16253      54
True            1462       2
['fest', 'dpr17']


In [207]:
# Cluster PS3 biomarkers that are literature genes of a type other than 'spermatocytes'.
tbl, common = get_miss("PS3", "spermatocytes")
print(tbl, common, sep='\n')


biomarker      False    True
-----------  -------  ------
False          17333      55
True             382       1
['fest']


In [208]:
# Cluster ECY biomarkers that are literature genes of a type other than 'cysc'.
tbl, common = get_miss("ECY", "cysc")
print(tbl, common, sep='\n')


biomarker      False    True
-----------  -------  ------
False          17048      52
True             670       1
['N']


In [209]:
# Cluster CY1 biomarkers that are literature genes of a type other than 'cysc'.
tbl, common = get_miss("CY1", "cysc")
print(tbl, common, sep='\n')


biomarker      False    True
-----------  -------  ------
False          17466      53
True             252       0
[]


In [210]:
# Cluster CY2 biomarkers that are literature genes of a type other than 'cysc'.
tbl, common = get_miss("CY2", "cysc")
print(tbl, common, sep='\n')


biomarker      False    True
-----------  -------  ------
False          16900      50
True             818       3
['cv-2', 'Dad', 'Prosalpha6']


In [211]:
# Cluster TE biomarkers that are literature genes of a type other than 'te'.
tbl, common = get_miss("TE", "te")
print(tbl, common, sep='\n')


biomarker      False    True
-----------  -------  ------
False          17086      61
True             612      12
['Fas3', 'ImpL2', 'robo2', 'zfh1', 'spi', 'fax', 'kek1', 'Dad', 'gbb', 'Socs36E', 'puc', 'br']


In [212]:
# Cluster PC biomarkers that are literature genes of a type other than 'pigment'.
tbl, common = get_miss("PC", "pigment")
print(tbl, common, sep='\n')



biomarker      False    True
-----------  -------  ------
False          16990      64
True             704      13
['abd-A', 'EcR', 'robo2', 'vn', 'zfh1', 'N', 'spi', 'fax', 'kek1', 'spict', 'Socs36E', 'puc', 'br']
