In [210]:
import os
import sys
from pathlib import Path

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
from larval_gonad.notebook import Nb

In [211]:
# Setup notebook
# Build the project-level notebook configuration object, pointing it at the
# Seurat output directory used throughout this notebook.
# NOTE(review): what setup_notebook configures beyond this is defined in
# larval_gonad.notebook and is not visible here.
seurat_dir = '../output/scrnaseq-wf/scrnaseq_combine_force'
nbconfig = Nb.setup_notebook(seurat_dir=seurat_dir)

last updated: 2018-07-12 
Git hash: ac53eab8570ee235fc7e9d15c84d3e028437e0a6


In [212]:
# Open writer and add some formats
# The xlsxwriter engine is requested explicitly: the workbook calls made in
# this notebook (add_format, add_worksheet, merge_range) are xlsxwriter APIs,
# so relying on whatever engine pandas picks by default is fragile.
writer = pd.ExcelWriter('../output/2018-07-12_results_table.xlsx', engine='xlsxwriter')

# Shared cell format: content aligned to the top of the cell, with text wrapping.
cell_format = writer.book.add_format({'valign': 'top', 'text_wrap': True})

In [213]:
# README Sheet
# Create the worksheet directly on the workbook and record it in the writer's
# sheet mapping, so the to_excel call at the end of this cell targets this
# sheet by name.
sheet = writer.book.add_worksheet('README')
writer.sheets['README'] = sheet

# Free-text note shown in the first row of the sheet.
comment = """\
Genes with missing counts were not captured by the experiment. 
"""

# Layout: two formatted columns (30 and 50 wide), a taller first row, and the
# note merged across A1:C1.
sheet.set_column(0, 0, width=30, cell_format=cell_format)
sheet.set_column(1, 1, width=50, cell_format=cell_format)
sheet.set_row(0, height=50, cell_format=cell_format)
sheet.merge_range('A1:C1', comment)

# Column name -> human readable description.
# Presumably "Cluster" in a name stands for a cluster label and "#" for a
# cluster number; confirm against the headers of the data_table sheet.
col_desc = {
    'FBgn': 'FlyBase Gene ID',
    'gene_symbol': 'Gene Symbol',
    'chrom': 'Chromosomal Arm',
    'Total Read Cnts': 'Sum of raw read counts across cells',
    'Median Scaled Read Cnts': 'Sum of raw read counts across cells scaled by median total read count across genes',
    # Fixed: the key previously read 'Cluster p_val Cluster'.
    'Cluster p_val': 'Seurat FindMarkers p-value',
    'Cluster p_val_adj': 'FDR Adjusted p-value',
    'Cluster avg_logFC': 'Average log Fold Change',
    # Fixed: the description previously hard-coded "cluster 0".
    'Cluster Prop Cells in Cluster': 'Proportion of Cells in the cluster with gene expressed.',
    'Cluster Prop Cells in Other Clusters': 'Proportion of Cells in all other clusters with gene expressed.',
    'Cluster TPM Read Cnts': 'TPM normalized read count',
    'Cluster RPKM Read Cnts': 'RPKM normalized read count',
    # Fixed: the description was a copy of the RPKM entry; the z-scores are
    # loaded from tpm_zscore.parquet.
    'Z-score Cluster #': 'Z-score of TPM normalized read count',
}

# Write the descriptions as a two-column table starting on the second row,
# below the merged note.
_desc = pd.Series(col_desc).to_frame()
_desc.index.name = 'Column Name'
_desc.columns = ['Column Description']
_desc.to_excel(writer, sheet_name='README', startrow=1)

In [214]:
# Gene annotation with symbols
# One row per FBgn: the gene symbol, joined with the FBgn -> chromosome
# information held by the notebook configuration.
symbols = pd.Series(nbconfig.fbgn2symbol, name='gene_symbol')
gene_annot = symbols.to_frame().join(nbconfig.fbgn2chrom)

In [215]:
# Total counts
raw = pd.read_parquet('../output/scrnaseq-wf/raw_by_cluster.parquet')

# Row-wise sum over every column of the raw table.
total = raw.sum(axis=1).rename('Total Read Cnts')

# The same totals expressed relative to the median row total.
scaled = (total / total.median()).rename('Median Scaled Read Cnts')

In [216]:
def prepend(dat, text):
    """Prefix every column label of ``dat`` with ``text`` followed by ``|``.

    The frame is changed in place (its ``columns`` attribute is reassigned)
    and nothing is returned.

    Parameters
    ----------
    dat : pandas.DataFrame
        Frame whose column labels are strings.
    text : str
        Prefix placed in front of each label, separated by ``|``.
    """
    relabeled = []
    for label in dat.columns:
        relabeled.append('|'.join((text, label)))
    dat.columns = relabeled

In [217]:
# Import normalized counts.
# Each table's columns are prefixed with a group label (see prepend); the
# label becomes the upper header row once the '|' separator is split later on.
tpm = pd.read_parquet('../output/scrnaseq-wf/tpm.parquet')
# Fixed: was 'TPM Read Counts', which disagreed with the 'Cnts' spelling used
# by the RPKM and total-count labels and by the README column descriptions.
prepend(tpm, 'TPM Read Cnts')

rpkm = pd.read_parquet('../output/scrnaseq-wf/rpkm.parquet')
prepend(rpkm, 'RPKM Read Cnts')

zscore = pd.read_parquet('../output/scrnaseq-wf/tpm_zscore.parquet')
prepend(zscore, 'Z-score')

In [218]:
# Unstack biomarker information: build one frame of marker statistics per
# cluster so they can later be concatenated side by side.
# drop() returns a new frame, so the object handed back by get_biomarkers is
# left untouched.
# NOTE(review): whether get_biomarkers returns a cached/shared object is not
# visible here; if it does, the previous in-place drop would break a re-run.
biomarker = nbconfig.seurat.get_biomarkers('res.0.6').drop(columns='gene_symbol')

# Statistics kept for every cluster, in output order.
stat_columns = ['p_val', 'p_val_adj', 'avg_logFC', 'pct.1', 'pct.2']

# Fixed: labels were 'Per Cells in ...', which did not match the
# 'Prop Cells in ...' names documented on the README sheet.
stat_labels = {
    'pct.1': 'Prop Cells in Cluster',
    'pct.2': 'Prop Cells in Other Clusters',
}

dfs = []
for g, dd in biomarker.groupby('cluster'):
    # Translate the cluster key into its annotation label.
    _clus = nbconfig.CLUSTER_ANNOT[g]
    # Selecting the statistic columns already excludes 'cluster'; rename()
    # returns a fresh frame, so relabelling below does not touch the group slice.
    bio = dd[stat_columns].rename(columns=stat_labels)
    prepend(bio, str(_clus))
    dfs.append(bio)
   

In [219]:
# Assemble the results table: annotation, total and scaled counts, the
# per-cluster biomarker statistics, then the normalized count tables,
# aligned on the (sorted) union of their indexes.
frames = [gene_annot, total, scaled] + dfs + [tpm, rpkm, zscore]
results = pd.concat(frames, axis=1, sort=True)
results.index.name = 'FBgn'
# Keep the gene symbol next to the FBgn as a second index level.
results = results.set_index('gene_symbol', append=True)

In [220]:
# Split each flat column label into (upper, lower) header parts.
# A label of the form 'group|name' yields ('group', 'name'); any other label
# gets a single-space upper header and its first '|'-separated piece as the
# lower header.
header_pairs = [
    (parts[0], parts[1]) if len(parts) == 2 else (' ', parts[0])
    for parts in results.columns.str.split('|').tolist()
]
upper = [top for top, bottom in header_pairs]
lower = [bottom for top, bottom in header_pairs]

In [221]:
# Install the two-level column header (upper label over lower label) and
# write the table to the 'data_table' sheet of the open workbook.
column_levels = [upper, lower]
results.columns = pd.MultiIndex.from_arrays(column_levels)
results.to_excel(excel_writer=writer, sheet_name='data_table')

In [223]:
# Close the workbook.
# close() saves the file and releases it; it replaces ExcelWriter.save(),
# which was deprecated and later removed from pandas.
writer.close()