In [374]:
import os
import sys
import re
from pathlib import Path
from collections import defaultdict
from itertools import combinations

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import mannwhitneyu, ks_2samp, friedmanchisquare, kruskal
from statsmodels.stats.multitest import multipletests

# Project level imports
from larval_gonad.notebook import Nb
from larval_gonad.x_to_a import commonly_expressed

In [267]:
# Build the notebook-wide configuration object. `seurat_dir` points at the
# combined scRNA-seq workflow output that later cells access via nbconfig.seurat.
nbconfig = Nb.setup_notebook(
    seurat_dir='../output/scrnaseq-wf/scrnaseq_combine_force',
)

last updated: 2018-11-01 
Git hash:


In [338]:
# Short label for every Seurat cluster id that is kept for the analysis.
cluster_annot = {
    0: 'LS',
    1: 'MC',
    2: 'MS',
    3: 'ES',
    4: 'LC',
    5: 'EC',
    6: 'SP',
    7: 'TE',
    8: 'PC',
}

# Category order used for the ordered categorical `cluster` column.
cluster_order = [
    'SP',
    'ES',
    'MS',
    'LS',
    'EC',
    'MC',
    'LC',
    'TE',
    'PC',
]

# Per-cell cluster assignment for the 'res.0.6' clustering.
clusters = nbconfig.seurat.get_clusters('res.0.6')
# Clusters 9, 10 and 11 have no entry in `cluster_annot`; drop their cells.
clusters = clusters[~clusters.isin([9, 10, 11])].copy()
clusters = clusters.map(cluster_annot)
clusters = pd.Series(
    pd.Categorical(clusters.values, categories=cluster_order, ordered=True),
    index=pd.Index(clusters.index, name='cell_id'),
    name='cluster',
).to_frame()
# Cell ids look like 'rep1_<barcode>'; extract the replicate prefix.
# Raw string: '\d' inside a normal string literal is an invalid escape sequence.
clusters['rep'] = clusters.index.str.extract(r'(rep\d)_').values.flatten()

In [339]:
# Gene-to-chromosome table without chrY, chrM and chr4 (excluded by the query).
chroms = nbconfig.fbgn2chrom.query('chrom != "chrY" & chrom != "chrM" & chrom != "chr4"').copy()
# Make `chrom` categorical, with categories ordered as nbconfig.chrom_order
# minus its last two entries.
chroms['chrom'] = (
    chroms['chrom']
    .astype('category')
    .cat.reorder_categories(nbconfig.chrom_order[:-2])
)

In [340]:
# Normalized read counts from the Seurat output.
# NOTE(review): presumably genes x cells -- later cells reindex rows by gene id
# and columns by cell id; confirm against nbconfig.seurat.
norm = nbconfig.seurat.get_normalized_read_counts()

In [378]:
# Keep only the rows selected by `commonly_expressed`.
# NOTE(review): presumably the ids of genes expressed in many cells -- confirm
# in larval_gonad.x_to_a.
expressed_genes = commonly_expressed(data=norm)
norm = norm.reindex(expressed_genes)

In [379]:
def prop_on(x):
    """Return the sum of ``x`` divided by its length along the first axis.

    For a boolean on/off vector this is the fraction of entries that are on.
    """
    n_on = x.sum()
    n_total = x.shape[0]
    return n_on / n_total


def permutation_sample(data1, data2):
    """Shuffle the pooled observations and split them back into two groups.

    Returns a tuple ``(sample_1, sample_2)``: ``sample_1`` holds the first
    ``len(data1)`` elements of one random permutation of ``data1`` and
    ``data2`` concatenated, ``sample_2`` holds the remaining elements.
    """
    pooled = np.concatenate((data1, data2))
    shuffled = np.random.permutation(pooled)
    n_first = len(data1)
    return shuffled[:n_first], shuffled[n_first:]


def statistic(data1, data2, test='ks'):
    """Compute a two-sample test statistic for ``data1`` versus ``data2``.

    Parameters
    ----------
    data1, data2 : array-like
        The two samples to compare.
    test : {'ks', 'mann'}
        'ks' uses ``scipy.stats.ks_2samp``; 'mann' uses
        ``scipy.stats.mannwhitneyu``.

    Returns
    -------
    The statistic reported by the selected test (the p-value is discarded).

    Raises
    ------
    ValueError
        If ``test`` is not one of the supported names.
    """
    if test == 'ks':
        stat, _ = ks_2samp(data1, data2)
    elif test == 'mann':
        stat, _ = mannwhitneyu(data1, data2)
    else:
        # Without this branch `stat` would be unbound and the return below
        # would fail with an unhelpful UnboundLocalError.
        raise ValueError(f"Unknown test {test!r}; expected 'ks' or 'mann'.")

    return stat


def draw_perm_reps(data1, data2, func, size=1):
    """Generate ``size`` permutation replicates of ``func``.

    Each replicate pools and reshuffles ``data1`` and ``data2`` with
    ``permutation_sample`` and evaluates ``func`` on the two permuted samples.
    Returns a 1-D array of length ``size`` holding the replicate values.
    """
    replicates = np.empty(size)
    for idx in range(size):
        shuffled_pair = permutation_sample(data1, data2)
        replicates[idx] = func(*shuffled_pair)

    return replicates

In [380]:
def _run1(cluster, rep, dat):
    """Pairwise chromosome permutation tests for one group of cells.

    ``cluster`` and ``rep`` are accepted but not used inside the function.
    ``dat`` is joined with the module-level ``chroms`` table on its index, so
    its rows must share that index.

    Returns a list of ``(chrom1, chrom2, pval)`` tuples, one per unordered
    pair of chromosome columns.
    """
    # For every column of `dat`: proportion of each chromosome's rows with a
    # value > 0. After the transpose, chromosomes are the columns.
    expressed = dat > 0
    prop_genes_on = expressed.join(chroms).groupby('chrom').agg(prop_on).T

    pvals = []
    for c1, c2 in combinations(prop_genes_on.columns, 2):
        first, second = prop_genes_on[c1], prop_genes_on[c2]

        observed = statistic(first, second)
        null_stats = draw_perm_reps(first, second, func=statistic, size=1_000)
        # Fraction of permuted statistics at least as large as the observed one.
        pvals.append((c1, c2, np.sum(null_stats >= observed) / len(null_stats)))

    return pvals


# Run the pairwise test for every (cluster, replicate) group of cells and
# collect one record per chromosome pair.
results = list()
for (clus, rep), dd in clusters.groupby(['cluster', 'rep']):
    # Expression columns restricted to the cells of this group.
    _dat = norm.reindex(columns=dd.index)
    for res in _run1(clus, rep, _dat):
        results.append([clus, rep] + list(res))

df = pd.DataFrame(
    results,
    columns=['cluster', 'rep', 'chrom1', 'chrom2', 'p-value'],
)
# One row per (cluster, chrom1, chrom2), one column per replicate.
wide = df.pivot_table(
    index=['cluster', 'chrom1', 'chrom2'],
    columns='rep',
    values='p-value',
)

In [381]:
# Show the per-replicate p-values for three clusters, each under its label.
for _cluster_label in ('SP', 'MC', 'MS'):
    print(_cluster_label)
    display(wide.loc[_cluster_label])


SP


Unnamed: 0_level_0,rep,rep1,rep2,rep3
chrom1,chrom2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chr2L,chr2R,0.349,0.509,0.669
chr2L,chr3L,0.098,0.344,0.208
chr2L,chr3R,0.07,0.041,0.006
chr2R,chr3L,0.654,0.744,0.349
chr2R,chr3R,0.617,0.235,0.057
chr3L,chr3R,0.822,0.292,0.028
chrX,chr2L,0.029,0.269,0.013
chrX,chr2R,0.39,0.862,0.044
chrX,chr3L,0.965,0.697,0.286
chrX,chr3R,0.898,0.133,0.002


MC


Unnamed: 0_level_0,rep,rep1,rep2,rep3
chrom1,chrom2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chr2L,chr2R,0.0,0.008,0.366
chr2L,chr3L,0.0,0.002,0.0
chr2L,chr3R,0.0,0.056,0.0
chr2R,chr3L,0.0,0.0,0.0
chr2R,chr3R,0.0,0.0,0.0
chr3L,chr3R,0.008,0.097,0.0
chrX,chr2L,0.004,0.042,0.0
chrX,chr2R,0.031,0.365,0.0
chrX,chr3L,0.0,0.0,0.007
chrX,chr3R,0.0,0.002,0.0


MS


Unnamed: 0_level_0,rep,rep1,rep2,rep3
chrom1,chrom2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chr2L,chr2R,0.035,0.381,0.0
chr2L,chr3L,0.0,0.037,0.0
chr2L,chr3R,0.007,0.263,0.001
chr2R,chr3L,0.001,0.635,0.0
chr2R,chr3R,0.743,0.91,0.963
chr3L,chr3R,0.011,0.537,0.0
chrX,chr2L,0.0,0.0,0.0
chrX,chr2R,0.0,0.0,0.0
chrX,chr3L,0.005,0.0,0.0
chrX,chr3R,0.0,0.0,0.0


In [382]:
def _run2(cluster, rep, dat):
    """Permutation test of each chromosome against the autosome median.

    ``cluster`` and ``rep`` are accepted but not used inside the function.
    ``dat`` is joined with the module-level ``chroms`` table on its index, so
    its rows must share that index.

    Returns a list of ``(chrom, pval)`` tuples, one per chromosome column.
    """
    # For every column of `dat`: proportion of each chromosome's rows with a
    # value > 0. After the transpose, chromosomes are the columns.
    prop_genes_on = (dat > 0).join(chroms).groupby('chrom').agg(prop_on).T
    # Reference sample: row-wise median across the four listed autosome arms.
    _med = prop_genes_on[['chr2L', 'chr2R', 'chr3L', 'chr3R']].median(axis=1)

    # Permutation test for each chromosome to median Autosome
    pvals = list()
    for chrom in prop_genes_on.columns:
        _d1 = prop_genes_on[chrom]
        _d2 = _med

        obs = statistic(_d1, _d2)
        perms = draw_perm_reps(_d1, _d2, func=statistic, size=1_000)
        # Fraction of permuted statistics at least as large as the observed one.
        pval = np.sum(perms >= obs) / len(perms)
        pvals.append((chrom, pval))

    return pvals


# Run the chromosome-vs-autosome-median test for every (cluster, replicate)
# group. `clus`, `rep` and `_dat` stay bound to the last group afterwards.
results = list()
for (clus, rep), dd in clusters.groupby(['cluster', 'rep']):
    # Expression columns restricted to the cells of this group.
    _dat = norm.reindex(columns=dd.index)
    for res in _run2(clus, rep, _dat):
        results.append([clus, rep] + list(res))

df = pd.DataFrame(
    results,
    columns=['cluster', 'rep', 'chrom', 'p-value'],
)
# Benjamini-Hochberg adjustment over all p-values at once; element [1] of the
# multipletests result holds the corrected p-values.
df['q-value'] = multipletests(df['p-value'], method='fdr_bh')[1]
wide = df.pivot_table(
    index=['cluster', 'chrom'],
    columns='rep',
    values='q-value',
)

In [383]:
# Show the per-replicate q-values for three clusters, each under its label.
for _cluster_label in ('SP', 'MC', 'MS'):
    print(_cluster_label)
    display(wide.loc[_cluster_label])


SP


rep,rep1,rep2,rep3
chrom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chr2L,0.347426,0.783314,0.647349
chr2R,1.0,1.0,0.926075
chr3L,1.0,1.0,0.5175
chr3R,1.0,0.334385,0.461554
chrX,0.913636,0.913636,0.077553


MC


rep,rep1,rep2,rep3
chrom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chr2L,0.361268,0.763941,0.009643
chr2R,0.0,0.0,0.0
chr3L,0.0,0.027,0.0
chr3R,0.09934,0.592313,0.07875
chrX,0.0,0.079412,0.0


MS


rep,rep1,rep2,rep3
chrom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chr2L,0.064125,0.641667,0.0
chr2R,1.0,1.0,0.926075
chr3L,0.0135,0.826319,0.005
chr3R,1.0,1.0,1.0
chrX,0.0,0.0,0.0


In [384]:
# Loop variables left over from the loop above: the (cluster, replicate) of the
# last group iterated, i.e. the group whose cells `_dat` currently holds.
clus, rep

('PC', 'rep3')

In [385]:
# Per-chromosome proportion of rows with a value > 0 for every column of the
# leftover `_dat`. After the transpose, chromosomes are the columns.
prop_genes_on = (
    (_dat > 0)
    .join(chroms)
    .groupby('chrom')
    .agg(prop_on)
    .T
)


In [386]:
# Kruskal-Wallis H-test with one sample per column of prop_genes_on
# (one sample per chromosome).
kruskal(*prop_genes_on.values.T)

KruskalResult(statistic=12.972667378744372, pvalue=0.011410121053302264)

In [387]:
# Friedman test on the same per-chromosome samples; unlike Kruskal-Wallis it
# treats values in the same row of prop_genes_on as repeated measurements.
friedmanchisquare(*prop_genes_on.values.T)

FriedmanchisquareResult(statistic=192.88049313978993, pvalue=1.2742993323540951e-40)

In [373]:
# Preview the cell -> (cluster, rep) table; show a bounded slice rather than
# dumping the whole frame.
clusters.head(10)

Unnamed: 0_level_0,cluster,rep
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
rep1_AAACCTGAGACAAGCC,MC,rep1
rep1_AAACCTGAGCTTATCG,MC,rep1
rep1_AAACCTGGTGTAATGA,EC,rep1
rep1_AAACCTGGTTCATGGT,MS,rep1
rep1_AAACCTGTCCCTCTTT,MC,rep1
rep1_AAACGGGAGACTGGGT,LC,rep1
rep1_AAACGGGCACAGTCGC,LS,rep1
rep1_AAACGGGGTCGCGTGT,LS,rep1
rep1_AAACGGGGTGTTAAGA,EC,rep1
rep1_AAACGGGTCGTCCAGG,MC,rep1
