# Package Up Differential Expression Results

In [1]:
import os
import sys
from pathlib import Path
from string import ascii_uppercase

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
from larval_gonad.notebook import Nb
from larval_gonad.plotting import make_figs
from larval_gonad.config import memory
from larval_gonad.scRNAseq import CLUSTER_ANNOT

# Configure the notebook session; the Seurat output directory given here is
# read back later through `nbconfig.seurat_dir` when result tables are loaded.
nbconfig = Nb.setup_notebook(
    seurat_dir='../output/scrnaseq_combine_force',
)

last updated: 2018-05-17 
Git hash: 2ab7e147bd4d8cf2b5529684ed180aa07ed7c54e


In [2]:
# Workbook that collects the differential expression tables, one sheet per comparison.
# The engine is pinned explicitly because `writer.book.add_format` (used below)
# is part of the xlsxwriter workbook API; with another backend it would not exist.
writer = pd.ExcelWriter(
    '../output/2018-05-17_differential_expression_results.xlsx',
    engine='xlsxwriter',
)

# Top-aligned, wrapped text format for long free-text cells.
cell_format = writer.book.add_format({'valign': 'top', 'text_wrap': True})

## One vs Rest

In [3]:
def _add_sheet(writer, sheet_name, fname, alpha=0.01, comment=None, headers=None):
    """Write one filtered differential expression table to a new worksheet.

    The table is read from ``fname`` inside ``nbconfig.seurat_dir``, filtered
    to rows with ``p_val_adj <= alpha``, sorted, and written starting on the
    second spreadsheet row so the first row stays free for ``comment``.

    Parameters
    ----------
    writer : pd.ExcelWriter
        Open writer whose ``book`` provides ``add_worksheet`` and
        ``add_format`` (the xlsxwriter workbook API).
    sheet_name : str
        Name of the worksheet to create.
    fname : str
        Tab-separated results file, relative to ``nbconfig.seurat_dir``.
        It must contain ``p_val_adj`` and ``avg_logFC`` columns.
    alpha : float, optional
        Adjusted p-value cutoff; rows with a larger ``p_val_adj`` are dropped.
    comment : str, optional
        Free text placed in the merged range ``A1:G1`` above the table.
    headers : sequence of two str, optional
        Replacement names for the ``pct.1`` and ``pct.2`` columns, in that order.
    """
    sheet = writer.book.add_worksheet(sheet_name)
    writer.sheets[sheet_name] = sheet
    sheet.set_column(0, 1, 20)

    table = pd.read_csv(Path(nbconfig.seurat_dir, fname), sep='\t')
    table = table[table['p_val_adj'] <= alpha]

    if 'cluster' in table.columns:
        # Sort on both keys at once. Sorting by avg_logFC and then re-sorting by
        # cluster is only correct with a stable sort, which is not the default,
        # so the within-cluster avg_logFC order was not guaranteed.
        table = table.sort_values(by=['cluster', 'avg_logFC'], ascending=[True, False])
        # Assign the relabelled column back explicitly; an in-place replace on
        # the attribute-accessed Series may act on a copy and leave the frame unchanged.
        table['cluster'] = table['cluster'].replace(CLUSTER_ANNOT)
        idx = table.columns.tolist().index('cluster')
        sheet.set_column(idx, idx, 20)
    else:
        table = table.sort_values(by='avg_logFC', ascending=False)

    if headers:
        table = table.rename(columns={'pct.1': headers[0], 'pct.2': headers[1]})

    # startrow=1 leaves spreadsheet row 1 empty for the comment.
    table.to_excel(writer, sheet_name=sheet_name, index=False, startrow=1, freeze_panes=(2, 2))

    if comment:
        # Build the format from this writer's own workbook instead of relying on
        # a notebook-level global that belongs to one particular workbook.
        comment_format = writer.book.add_format({'valign': 'top', 'text_wrap': True})
        sheet.set_row(0, 100, comment_format)
        sheet.merge_range('A1:G1', comment)
        

In [4]:
# Header text for the one-vs-rest sheet. The trailing backslashes are line
# continuations inside the string, so the sentences form a single line of text.
comment = """\
This is the initial differential expression analysis to identify 'biomarkers' for lack of a better term. \
Here we take each cluster and compare its expression to all other cells. \
This creates a list of genes that are upregulated in each cluster. \
This table is grouped by clusters and sorted by avg_logFC.\
"""
_add_sheet(writer, 'One vs Rest (biomarkers)', 'biomarkers_res.0.6.tsv', comment=comment)

In [5]:
# Germ vs soma sheet: the pct.1 / pct.2 columns are renamed to name the two groups.
comment = """\
Here I have done a differential expression between the germ cell and somatic cell lines. \
For this analysis I combine all of the germ cell clusters vs all of the somatic cell clusters.

Positive avg_logFC are germ biased genes.
Negative avg_logFC are soma biased genes.
"""
_add_sheet(
    writer,
    'Germ Cells vs Somatic Cells',
    '2018-05-16_scrnaseq_germ_vs_soma_biomarkers.tsv',
    comment=comment,
    headers=('pct.germ', 'pct.soma')
)

In [6]:
# Spermatogonia vs spermatocytes sheet: the pct.1 / pct.2 columns are renamed to name the two groups.
comment = """\
Here I have done a differential expression of spermatogonia vs 1º spermatocytes. \
For this analysis I took the spermatogonia cluster and compared it to all spermatocyte \
clusters combined together.

Positive avg_logFC are spermatogonia biased genes.
Negative avg_logFC are 1º spermatocyte biased genes.
"""
_add_sheet(
    writer,
    'Gonia vs Cytes',
    '2018-05-16_scrnaseq_spermatogonia_vs_spermatocytes_biomarkers.tsv',
    comment=comment,
    headers=('pct.gonia', 'pct.cytes')
)

In [7]:
# Early vs mid/late spermatocytes sheet: the pct.1 / pct.2 columns are renamed to name the two groups.
comment = """\
Here I have done a differential expression of Early 1º spermatocytes vs Mid and Late 1º spermatocytes.

Positive avg_logFC are early 1º spermatocyte biased genes.
Negative avg_logFC are mid and late 1º spermatocyte biased genes.
"""
_add_sheet(
    writer,
    'Early cytes vs Mid and Late',
    '2018-05-16_scrnaseq_early_spermatocytes_vs_spermatocytes_biomarkers.tsv',
    comment=comment,
    headers=('pct.early', 'pct.midLate')
)

In [8]:
# Mid vs early/late spermatocytes sheet: the pct.1 / pct.2 columns are renamed to name the two groups.
comment = """\
Here I have done a differential expression of Mid 1º spermatocytes vs Early and Late 1º spermatocytes.

Positive avg_logFC are mid 1º spermatocyte biased genes.
Negative avg_logFC are early and late 1º spermatocyte biased genes.
"""
_add_sheet(
    writer,
    'Mid cytes vs Early and Late',
    '2018-05-16_scrnaseq_mid_spermatocytes_vs_spermatocytes_biomarkers.tsv',
    comment=comment,
    headers=('pct.mid', 'pct.earlyLate')
)

In [9]:
# Late vs early/mid spermatocytes sheet: the pct.1 / pct.2 columns are renamed to name the two groups.
comment = """\
Here I have done a differential expression of Late 1º spermatocytes vs Early and Mid 1º spermatocytes.

Positive avg_logFC are late 1º spermatocyte biased genes.
Negative avg_logFC are early and mid 1º spermatocyte biased genes.
"""
_add_sheet(
    writer,
    'Late cytes vs Early and Mid',
    '2018-05-16_scrnaseq_late_spermatocytes_vs_spermatocytes_biomarkers.tsv',
    comment=comment,
    headers=('pct.late', 'pct.earlyMid')
)

In [10]:
# Cluster 11 vs spermatocytes sheet: the pct.1 / pct.2 columns are renamed to name the two groups.
comment = """\
Cluster eleven is a little bit of a mystery to us. \
It behaves kind of like a 1º spermatocyte, but has very low expression. \
Here I run a differential expression between cluster eleven and the 1º spermatocyte clusters.

Positive avg_logFC are cluster 11 biased genes.
Negative avg_logFC are 1º spermatocyte biased genes.
"""
_add_sheet(
    writer,
    'Cluster 11 vs cytes',
    '2018-05-16_scrnaseq_eleven_vs_spermatocytes_biomarkers.tsv',
    comment=comment,
    headers=('pct.eleven', 'pct.cytes')
)

In [11]:
# All sheets have been added above; close the writer to finish the workbook.
writer.close()