# Identifying Cell Types of the Testis

In [149]:
import os
import sys
import re
from pathlib import Path
from yaml import load

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

## Data

In [150]:
# load all of my config settings
config = {}
with open('../config/common.yaml') as fh:
    config.update(load(fh.read()))
    
with open('../config/colors.yaml') as fh:
    config['colors'] = load(fh.read())
    
with open('../science_submission/config.yaml') as fh:
    config.update(load(fh.read()))

In [151]:
# Get list of genes from the literature
symbol2fbgn = pd.read_pickle('../output/science_submission/symbol2fbgn.pkl')
lit_genes = config['lit_genes_long']
lit_genes_fbgn = [symbol2fbgn[x] for x in config['lit_genes_long']]
print(len(lit_genes))
print(', '.join(sorted(lit_genes, key=lambda x: x.lower())))

72
abd-A, Abd-B, aly, apt, bam, bgcn, bnb, bol, bw, c-cup, can, CG11697, CG18628, CG3927, CG8368, d-cup, dj, EcR, Eip93F, ems, eya, fax, foxo, fzo, gbb, glob1, hui, ImpL2, kek1, Marf, mia, mle, Mst87F, MtnA, N, neur, nht, nord, nos, Nrt, ocn, oys, p-cup, p53, peb, Phf7, puc, r-cup, Rbp9, retn, robo2, sa, sano, sev, Six4, so, soti, sowi, Sox100B, spict, sunz, tej, tj, topi, tut, vas, vkg, vn, wa-cup, Wnt2, Wnt4, zfh1


In [152]:
# Get mappint of cell_id to short cluster name, remove unknown clusters
clusters = (
    pd.read_parquet('../output/scrnaseq-wf/clusters.parquet')
    .assign(cluster = lambda df: pd.Categorical(df.cluster.map(config['short_cluster_annot']), ordered=True, categories=config['short_cluster_order']))
    .dropna()
)
clusters.cluster.value_counts().sort_index().map(lambda x: f'{x:,}').rename('Cells Per Cluster').to_frame()

Unnamed: 0,Cells Per Cluster
SP,1367
E1º,1418
M1º,1651
L1º,2014
EC,1373
MC,1744
LC,1384
TE,946
PC,707


In [153]:
# Get biomarkers
resolution = config['resolution']
biomarkers = (
    pd.read_csv(f'../output/scrnaseq-wf/scrnaseq_combine_force/biomarkers_{resolution}.tsv', sep='\t', index_col=0)
    .rename_axis('FBgn')
    .query('p_val_adj <= 0.05')
    .assign(cluster = lambda df: pd.Categorical(df.cluster.map(config['short_cluster_annot']), ordered=True, categories=config['short_cluster_order']))
    .dropna()
    .loc[:, ['gene_symbol', 'cluster']]
)
display(biomarkers.groupby('cluster').size().sort_index().map(lambda x: f'{x:,}').rename('Biomarkers per cluster').to_frame())
print(f'There are {biomarkers.index.unique().shape[0]:,} biomakrer genes')

Unnamed: 0_level_0,Biomarkers per cluster
cluster,Unnamed: 1_level_1
SP,1383
E1º,1417
M1º,1744
L1º,760
EC,616
MC,352
LC,583
TE,638
PC,733


There are 4,872 biomakrer genes


In [154]:
print(f'Of the {len(lit_genes_fbgn):,} literature genes, only {biomarkers.query(f"FBgn == {lit_genes_fbgn}").index.unique().shape[0]:,} where in the biomarker list.')

Of the 72 literature genes, only 56 where in the biomarker list.


In [157]:
# Figure out how many literature genes were called biomarker for the right cluster.
PASS = 0
TOTAL = 0
def get_lit_gene_subset(cell_type):
    if cell_type == 'gonia':
        name = 'Spermatogonia'
        idx = (0, 12)
    elif cell_type == 'primary':
        name = 'Primary Spermatocytes'
        idx = (12, 34)
    elif cell_type == 'cyst':
        name = 'Somatic Cyst Cells'
        idx = (34, 58)
    elif cell_type == 'te':
        name = 'Terminal Epithelium'
        idx = (58, 67)
    elif cell_type == 'pc':
        name = 'Pigment Cells'
        idx = (67,71)
    display(HTML(f'<h3>{name}</h3>'))
    print(', '.join(sorted(lit_genes[idx[0]:idx[1]], key=lambda x: x.lower())))
    return lit_genes_fbgn[idx[0]: idx[1]]

def check_biomarkers(fbgns, cell_type_pattern):
    subset = (
        biomarkers.query(f'FBgn == {fbgns}')
        .groupby('gene_symbol')
        .apply(lambda df: '|'.join(df.cluster.sort_values().values))
        .rename('clusters')
        .to_frame()
        .assign(lower=lambda df: df.index.str.lower())
        .sort_values(by='lower')
        .drop('lower', axis=1)
    )
    num_with_correct_cell_type = subset.clusters.str.contains(cell_type_pattern).sum()
    total_num_genes = subset.shape[0]
    
    global PASS
    global TOTAL
    PASS += num_with_correct_cell_type
    TOTAL += total_num_genes
    
    print(f'There were ({num_with_correct_cell_type:,} / {total_num_genes:,} = {num_with_correct_cell_type / total_num_genes * 100:.2f}%) literature genes that were called biomarker in the correct cluster.')
    display(subset)

fbgns = get_lit_gene_subset('gonia')
check_biomarkers(fbgns, 'SP')

fbgns = get_lit_gene_subset('primary')
check_biomarkers(fbgns, 'E1º|M1º|L1º')

fbgns = get_lit_gene_subset('cyst')
check_biomarkers(fbgns, 'EC|MC|LC')

fbgns = get_lit_gene_subset('te')
check_biomarkers(fbgns, 'TE')

fbgns = get_lit_gene_subset('pc')
check_biomarkers(fbgns, 'PC')

print(f'{PASS:,} / {TOTAL:,} = {PASS / TOTAL:,.2f}%')

bam, bgcn, CG11697, Marf, nos, p53, peb, Phf7, Rbp9, tej, tut, vas
There were (5 / 7 = 71.43%) literature genes that were called biomarker in the correct cluster.


Unnamed: 0_level_0,clusters
gene_symbol,Unnamed: 1_level_1
bam,SP
CG11697,E1º|M1º|L1º
p53,SP
peb,LC
Phf7,SP
Rbp9,SP
vas,SP


aly, bol, c-cup, can, CG3927, d-cup, dj, fzo, mia, mle, Mst87F, nht, ocn, oys, p-cup, r-cup, sa, soti, sowi, sunz, topi, wa-cup
There were (18 / 18 = 100.00%) literature genes that were called biomarker in the correct cluster.


Unnamed: 0_level_0,clusters
gene_symbol,Unnamed: 1_level_1
aly,E1º
bol,E1º|M1º
c-cup,E1º|M1º|L1º
can,E1º
CG3927,E1º|M1º|L1º
d-cup,M1º
dj,E1º|M1º|L1º
fzo,M1º
Mst87F,E1º|M1º|L1º
nht,SP|E1º


apt, bnb, CG8368, EcR, Eip93F, eya, fax, foxo, gbb, glob1, hui, ImpL2, kek1, neur, Nrt, puc, robo2, sano, sev, spict, tj, vn, Wnt4, zfh1
There were (12 / 19 = 63.16%) literature genes that were called biomarker in the correct cluster.


Unnamed: 0_level_0,clusters
gene_symbol,Unnamed: 1_level_1
bnb,EC|LC
CG8368,E1º|M1º
EcR,TE|PC
Eip93F,LC
fax,EC|MC|LC|TE|PC
foxo,EC
gbb,TE
glob1,MC|LC
hui,EC
ImpL2,EC|TE


abd-A, Abd-B, CG18628, MtnA, nord, retn, Six4, so, Wnt2
There were (5 / 7 = 71.43%) literature genes that were called biomarker in the correct cluster.


Unnamed: 0_level_0,clusters
gene_symbol,Unnamed: 1_level_1
abd-A,TE|PC
Abd-B,TE
MtnA,EC|MC|LC
nord,TE
retn,TE
Six4,EC|MC|LC|TE
Wnt2,MC|LC


bw, N, Sox100B, vkg
There were (4 / 4 = 100.00%) literature genes that were called biomarker in the correct cluster.


Unnamed: 0_level_0,clusters
gene_symbol,Unnamed: 1_level_1
bw,PC
N,EC|TE|PC
Sox100B,PC
vkg,PC


44 / 55 = 0.80%
