In [1]:
import os
import sys
from pathlib import Path

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from larval_gonad.notebook import Nb
from larval_gonad.plotting import make_figs
from larval_gonad.config import memory

# Set up the notebook session. The returned `nbconfig` object is what later
# cells use to reach the Seurat results (`nbconfig.seurat`) and the
# FBgn -> gene symbol lookup (`nbconfig.fbgn2symbol`).
nbconfig = Nb.setup_notebook(
    seurat_dir='../output/combined_testis_force',
)

last updated: 2018-04-02 
Git hash: 6ffa85dd58c284a310b117876de44df894907c6b


In [2]:
# Load the two Seurat tables every later cell works from.
# NOTE(review): downstream code treats `norm` as genes (rows) x cells (columns)
# and `clusters` as a cell-indexed table with a 'res.0.4' column -- confirm
# against the Seurat loader.
norm, clusters = (
    nbconfig.seurat.get_normalized_read_counts(),
    nbconfig.seurat.get_clusters(),
)

In [3]:
def get_cluster_genes(gt, lt, norm_counts=None, cell_clusters=None, resolution='res.0.4'):
    """Flag genes detected broadly inside a cluster and rarely outside it.

    For each cluster found in ``cell_clusters[resolution]``, a gene is flagged
    when it has a count greater than zero in at least ``gt`` percent of the
    cluster's cells AND in at most ``lt`` percent of all remaining cells.

    Parameters
    ----------
    gt : number
        Minimum percentage (inclusive) of in-cluster cells with count > 0.
    lt : number
        Maximum percentage (inclusive) of out-of-cluster cells with count > 0.
    norm_counts : pandas.DataFrame, optional
        Count matrix with one row per gene and one column per cell.
        Defaults to the notebook-level ``norm``.
    cell_clusters : pandas.DataFrame, optional
        Table indexed by cell whose ``resolution`` column holds the cluster
        label of each cell. Defaults to the notebook-level ``clusters``.
    resolution : str, optional
        Name of the cluster-label column in ``cell_clusters``.

    Returns
    -------
    pandas.DataFrame
        Boolean frame indexed like ``norm_counts`` with one column per
        cluster, named ``gt{gt}_lt{lt}_{cluster}``.

    Raises
    ------
    KeyError
        If a cell listed in ``cell_clusters`` is not a column of
        ``norm_counts``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if norm_counts is None:
        norm_counts = norm
    if cell_clusters is None:
        cell_clusters = clusters

    name = f'gt{gt}_lt{lt}_'
    labels = cell_clusters[resolution]

    # Build the frame as real booleans up front instead of creating an
    # all-NaN object frame and relying on fillna() to downcast it.
    res = pd.DataFrame(
        False,
        index=norm_counts.index,
        columns=[name + str(x) for x in sorted(labels.unique())],
    )

    # Detection mask is the same for every cluster, so compute it once.
    detected = norm_counts > 0

    for i, c in cell_clusters.groupby(resolution):
        cells = c.index
        inclus = detected[cells]
        outclus = detected.loc[:, ~detected.columns.isin(cells)]
        n_inclus = inclus.shape[1]
        n_outclus = outclus.shape[1]

        # Percentage of cells with a non-zero count, inside vs. outside the cluster.
        pct_inclus = inclus.sum(axis=1) / n_inclus * 100
        pct_outclus = outclus.sum(axis=1) / n_outclus * 100

        # A gene is flagged only when both thresholds hold. When there are no
        # out-of-cluster cells the percentage is NaN, which compares False.
        res[name + str(i)] = (pct_inclus >= gt) & (pct_outclus <= lt)

    return res

In [4]:
# Evaluate every (in-cluster %, out-of-cluster %) threshold pair. The nested
# loops run in the order 60/1, 60/5, 60/10, 50/1, 50/5, 50/10, matching the
# names on the left-hand side.
(results_60_1, results_60_5, results_60_10,
 results_50_1, results_50_5, results_50_10) = [
    get_cluster_genes(gt, lt)
    for gt in (60, 50)
    for lt in (1, 5, 10)
]

In [5]:
# Put all threshold tables side by side, then reshape to long form: one row
# per (gene, 'gt<in>_lt<out>_<cluster>' column) carrying the boolean flag.
results = (
    pd.concat(
        [results_60_1, results_60_5, results_60_10,
         results_50_1, results_50_5, results_50_10],
        axis=1,
    )
    .stack()
    .to_frame(name='flag')
    .reset_index(level=-1)  # the stacked column labels land in 'level_1'
)

# Keep only the flagged pairs and split the column label into its three parts:
# '<gt..>_<lt..>_<cluster>'.
results = results.loc[results['flag']].assign(
    cluster=lambda df: df['level_1'].map(lambda label: label.split('_')[-1]).astype(np.int64),
    pct_in_cluster=lambda df: df['level_1'].map(lambda label: label.split('_')[0]),
    pct_outside_cluster=lambda df: df['level_1'].map(lambda label: label.split('_')[1]),
)

# Selecting the three derived columns discards 'flag' and 'level_1'; the gene
# index is then moved back into an ordinary column.
results = results[['cluster', 'pct_in_cluster', 'pct_outside_cluster']].reset_index()

In [6]:
# Sum each gene's counts over the cells of every cluster, then reshape to a
# long table with columns FBgn / cluster / sum.
sums = (
    norm.T
    .join(clusters['res.0.4'])
    .groupby('res.0.4')
    .sum()
    .T
    .unstack()
    .to_frame(name='sum')
    .reset_index(level=0)
    .rename(columns={'res.0.4': 'cluster'})
    .rename_axis('FBgn')
    .reset_index()
)

In [7]:
# For every cluster, count how many of its cells have a non-zero count for
# each gene.
res = {}
for i, g in norm.T.join(clusters['res.0.4']).groupby('res.0.4'):
    # Each group frame still carries the 'res.0.4' label column. Drop it
    # before counting so the labels are not tallied as if they were a gene
    # (which added a bogus 'res.0.4' row and would fail on non-numeric labels).
    res[i] = (g.drop('res.0.4', axis=1) > 0).sum()

sizes = pd.DataFrame(res)

# Reshape genes x clusters into a long table with columns FBgn / cluster / num_cells.
sizes = sizes.unstack().to_frame().reset_index(level=0)
sizes.columns = ['cluster', 'num_cells']
sizes.index.name = 'FBgn'
sizes = sizes.reset_index()

In [8]:
# Attach the per-cluster expressing-cell counts and summed counts to each
# flagged (gene, cluster) row. Both merges use pandas' default inner join.
on = ['FBgn', 'cluster']
merged = pd.merge(results, sizes, on=on)
merged = pd.merge(merged, sums, on=on)

In [9]:
# Add the gene symbol for each FBgn (a missing FBgn raises from the lookup),
# then index the table by (FBgn, gene).
merged = (
    merged
    .assign(gene=lambda df: df['FBgn'].map(lambda fbgn: nbconfig.fbgn2symbol[fbgn]))
    .set_index(['FBgn', 'gene'])
)

In [10]:
# Order rows by cluster number.
merged = merged.sort_values(by='cluster')

In [12]:
# Move the (FBgn, gene) index back into columns so both are written to the
# spreadsheet, and omit the positional row index.
merged.reset_index().to_excel(
    excel_writer='../output/2018-04-02_cluster_flags.xlsx',
    index=False,
)