Sharvani needed a table of biomarkers with stock IDs. This is quick and dirty because we are on a time crunch.

In [9]:
import os
from pathlib import Path
import pandas as pd

In [119]:
# Load the GAL4 stock table and split it into Janelia and Bloomington subsets.
# Only the stock id, gene symbol, and construct symbol columns are read;
# lines starting with '#' in the TSV are skipped as comments.
stocks = pd.read_csv('../../data/external/bloomington_gal4.tsv',
                     comment='#', sep='\t', usecols=['stk_id', 'gene_symbol', 'symbol'])

# Rows whose symbol contains 'GMR' go to the Janelia subset, everything else
# to the Bloomington subset. na=False makes a missing symbol count as
# "not GMR" instead of producing NaN in the mask, which would break both the
# boolean indexing and the negation below.
is_gmr = stocks.symbol.str.contains('GMR', na=False)
janelia = stocks[is_gmr]
bloomington = stocks[~is_gmr]

In [125]:
# collapse stock numbers to a list
def collapse_stocks(df, name='stocks'):
    """Gather the stock ids of ``df`` into a single list-valued entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows carrying a ``stk_id`` column.
    name : str
        Label given to the single entry of the returned Series.

    Returns
    -------
    pandas.Series
        One-element Series, indexed by ``name``, whose value is the list of
        ``stk_id`` values cast to int.
    """
    stock_ids = df['stk_id'].astype(int).tolist()
    return pd.Series(data=[stock_ids], index=[name])

# Build one row per gene symbol holding the list of its stock ids, separately
# for the Bloomington and Janelia subsets.
bs = bloomington.groupby('gene_symbol').apply(lambda x: collapse_stocks(x, 'bloomington_stocks'))
js = janelia.groupby('gene_symbol').apply(lambda x: collapse_stocks(x, 'janelia_stocks'))

# Outer join so that genes with stocks in only one of the two subsets are
# kept; the default left join would drop genes that appear only in `js`.
stocks_collapsed = bs.join(js, how='outer')

In [127]:
# Read the biomarker table, indexed by its 'gene' column.
# NOTE(review): 'gene' presumably holds FBgn ids, since the table is later
# joined against an FBgn-indexed annotation -- confirm.
biomarkers = pd.read_csv(
    '../../output/testes_scRNAseq_pilot/biomarkers.tsv',
    index_col='gene',
    sep='\t',
)

In [129]:
# Load the FBgn -> gene symbol annotation from the reference directory named
# by the REFERENCES_DIR environment variable (raises KeyError if it is unset).
REF = os.environ['REFERENCES_DIR']

fbgn2symbol = pd.read_csv(
    Path(REF) / 'dmel/r6-16/fb_annotation/dmel_r6-16.fb_annotation',
    sep='\t',
    usecols=['primary_FBgn', 'gene_symbol'],
    index_col='primary_FBgn',
)

# Reverse lookup: the same table keyed by gene symbol instead of FBgn.
symbol2fbgn = fbgn2symbol.reset_index().set_index('gene_symbol')

In [141]:
# Put everything together: attach the gene symbol to each biomarker row,
# then attach the per-gene stock lists by symbol, and order rows by cluster.
# Both steps are left joins, so every biomarker row is kept.
bioStock = (
    biomarkers
    .join(fbgn2symbol, how='left')
    .merge(stocks_collapsed, left_on='gene_symbol', right_index=True, how='left')
    .sort_values(by=['cluster'])
)

In [None]:
# add links to flybase for easy browsing
def linkify(fbgn):
    """Return a spreadsheet HYPERLINK formula for a FlyBase report page.

    Parameters
    ----------
    fbgn : str
        Identifier interpolated into the FlyBase report URL.

    Returns
    -------
    str
        A ``=HYPERLINK("<url>","FlyBase")`` formula string.
    """
    report_url = f'http://flybase.org/reports/{fbgn}.html'
    return '=HYPERLINK("{}","FlyBase")'.format(report_url)

# One FlyBase link per row, built from the row's index value.
bioStock['LinkOut'] = bioStock.index.map(linkify)

In [161]:
# Clean up the table: index rows by (FBgn, gene_symbol) and keep only the
# output columns, in the order listed in `header`.
header = [
    'cluster',
    'p_val',
    'p_val_adj',
    'avg_logFC',
    'pct.1',
    'pct.2',
    'bloomington_stocks',
    'janelia_stocks',
    'LinkOut',
]
bioStock.index.name = 'FBgn'
bioStock = bioStock.set_index('gene_symbol', append=True)[header].copy()

In [178]:
# Order rows by cluster, then by avg_logFC within a cluster, with ties broken
# by larger p_val_adj first. A single multi-key sort is used because
# consecutive single-column sorts with the default (non-stable) algorithm do
# not reliably preserve the ordering established by the earlier sorts.
bioStock.sort_values(by=['cluster', 'avg_logFC', 'p_val_adj'],
                     ascending=[True, True, False], inplace=True)

In [180]:
# Write the final table, including its index levels, as a tab-separated file.
bioStock.to_csv(
    '../../output/20171205_testis_biomarkers_with_stocks.tsv',
    sep='\t',
)