# Slice and Dice Biomarkers

Brian wants the top 10 named biomarkers for each cluster so that he can start a literature review. There are several ways to slice and dice the data, so it will probably be easiest to present him with a few tables.

In [1]:
import sys
import os
from pathlib import Path

import numpy as np
import pandas as pd

sys.path.insert(0, '../lib')
from larval_gonad.x_to_a import CHROMS_CHR

In [2]:
# Constants
# REF: root of the reference files, read from the environment
# (a missing REFERENCES_DIR raises KeyError here, at the top of the run).
REF = os.environ['REFERENCES_DIR']
# OUTPUT: directory used for this analysis' input and output tables.
OUTPUT = '../output/testis_scRNAseq_pilot'
# parents=True so a missing ../output does not abort with FileNotFoundError;
# exist_ok=True keeps re-running the cell harmless.
Path(OUTPUT).mkdir(parents=True, exist_ok=True)
# NAME: prefix shared by the files this notebook writes.
NAME = '2018-02-01_slice_and_dice_biomarkers'

In [3]:
# Create fbgn2symbol and symbol2fbgn map
# The table is indexed by its second column (index_col=1); the keys of
# fbgn2symbol are therefore that column's values -- presumably the primary
# FBgn, TODO confirm against the file header.
annot = pd.read_csv(
    Path(REF, 'dmel/r6-16/fb_annotation/dmel_r6-16.fb_annotation'),
    sep='\t',
    index_col=1,
)

fbgn2symbol = annot['gene_symbol'].to_dict()
# Reverse lookup; if two keys share a symbol the last one seen wins.
symbol2fbgn = {symbol: fbgn for fbgn, symbol in fbgn2symbol.items()}

# Create fbgn2chrom
genes = []
# Translation table that deletes the '"' and ';' wrapped around GTF values.
_gtf_unquote = str.maketrans('', '', '";')
with Path(REF, 'dmel/r6-16/gtf/dmel_r6-16.gtf').open() as fh:
    for row in fh:
        # GTF files may carry '#' comment/header lines; they are not records.
        if row.lstrip().startswith('#'):
            continue

        fields = row.split()

        # Blank or truncated lines have no feature-type field to inspect;
        # skipping them avoids an IndexError on fields[2].
        if len(fields) < 3:
            continue

        if fields[2] == 'gene':
            # After whitespace splitting, token 9 is the value of the first
            # attribute -- assumed to be gene_id (the FBgn); TODO confirm.
            genes.append((fields[0], fields[9].translate(_gtf_unquote)))

fbgn2chrom = pd.DataFrame(genes, columns=['chrom', 'FBgn']).set_index('FBgn')
# Keep only genes located on the chromosomes listed in CHROMS_CHR.
fbgn2chrom = fbgn2chrom[fbgn2chrom['chrom'].isin(CHROMS_CHR)]

In [4]:
# Load the biomarker table and index it by (FBgn, gene symbol)
df = pd.read_csv(f'{OUTPUT}/biomarkers.tsv', sep='\t', index_col='gene')
# The file's 'gene' column is looked up in fbgn2symbol below, so it is
# relabelled 'FBgn' before a symbol column named 'gene' is added.
df.index = df.index.rename('FBgn')
# Plain dict lookup: an identifier missing from fbgn2symbol raises KeyError.
df['gene'] = df.index.map(fbgn2symbol.__getitem__)
df = df.set_index('gene', append=True)

In [5]:
# Remove CG and CRs
# Unnamed genes carry a bare annotation ID as their symbol: 'CG' or 'CR'
# followed only by digits. Matching that exact shape (rather than any symbol
# that merely starts with 'CG'/'CR') keeps named genes such as CRMP or CREG.
symbols = df.index.get_level_values('gene')
cg = ~symbols.str.match(r'CG\d+$')
cr = ~symbols.str.match(r'CR\d+$')
# Keep only markers whose adjusted p-value is below 0.01.
pv = df.p_val_adj < .01
df = df[cg & cr & pv]
df.to_csv(f'{OUTPUT}/{NAME}_named_cluster_markers.tsv', sep='\t')

In [6]:
# Top 10 markers per cluster, ranked by adjusted p-value (smallest first)
ranked_by_pval = df.sort_values(by='p_val_adj')
top10_by_pval = ranked_by_pval.groupby('cluster').head(10)
clean = top10_by_pval.sort_values('cluster').drop(['p_val', 'pct.1', 'pct.2'], axis='columns')
# Spreadsheet formula that links each row to its FlyBase report page.
flybase_link = '=HYPERLINK("http://flybase.org/reports/{}", "FlyBase")'.format
clean['link'] = clean.index.get_level_values('FBgn').map(flybase_link)
clean.to_csv(f'{OUTPUT}/{NAME}_top10_adj-pval_cluster_markers.tsv', sep='\t')

In [7]:
# Top 10 markers per cluster, ranked by magnitude of average log fold change
# The helper column is stored on df itself, so it remains available afterwards.
df['abs_avg_logFC'] = df['avg_logFC'].abs()
ranked_by_logfc = df.sort_values(by='abs_avg_logFC', ascending=False)
top10_by_logfc = ranked_by_logfc.groupby('cluster').head(10)
clean = top10_by_logfc.sort_values('cluster').drop(['p_val', 'pct.1', 'pct.2'], axis='columns')
# Spreadsheet formula that links each row to its FlyBase report page.
flybase_link = '=HYPERLINK("http://flybase.org/reports/{}", "FlyBase")'.format
clean['link'] = clean.index.get_level_values('FBgn').map(flybase_link)
clean.to_csv(f'{OUTPUT}/{NAME}_top10_avg-logFC_cluster_markers.tsv', sep='\t')

In [8]:
# Top 10 markers per cluster, ranked by |pct.1 - pct.2|
# The helper column is stored on df itself; pct.1 and pct.2 stay in this
# table (only p_val is dropped) so the difference can be read in context.
df['pct_diff'] = (df['pct.1'] - df['pct.2']).abs()
ranked_by_pct = df.sort_values(by='pct_diff', ascending=False)
top10_by_pct = ranked_by_pct.groupby('cluster').head(10)
clean = top10_by_pct.sort_values('cluster').drop(['p_val'], axis='columns')
# Spreadsheet formula that links each row to its FlyBase report page.
flybase_link = '=HYPERLINK("http://flybase.org/reports/{}", "FlyBase")'.format
clean['link'] = clean.index.get_level_values('FBgn').map(flybase_link)
clean.to_csv(f'{OUTPUT}/{NAME}_top10_pct-cells-diff_cluster_markers.tsv', sep='\t')