In [89]:
import os
import sys
from pathlib import Path
from tempfile import NamedTemporaryFile

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
from larval_gonad.notebook import Nb
from larval_gonad.gene_ontology import run_fly, run_flyslim

In [2]:
# Setup notebook: build the project-level notebook config object. The Seurat
# output directory given here is what `nbconfig.seurat` reads from in later cells.
nbconfig = Nb.setup_notebook(seurat_dir='../scrnaseq-wf/data/scrnaseq_combine_force')

last updated: 2018-08-03 
Git hash: 64534f58712a8f53dd67f4d600a0bffd8919a9fc


In [3]:
# Background population for the enrichment tests: the unique index labels of the
# normalized read-count table (presumably gene identifiers — confirm), as a plain list.
background = nbconfig.seurat.get_normalized_read_counts().index.unique().tolist()

In [86]:
# Biomarker table for the 'res.0.6' clustering; it is filtered on its `cluster`
# and `p_val_adj` columns, and its index supplies the study items for each cluster.
biomarkers = nbconfig.seurat.get_biomarkers('res.0.6')

def analyze_cluster(cluster, func, max_p_val_adj=0.01, cutoff=0.05):
    """Run an enrichment function on the significant biomarkers of one cluster.

    Reads the module-level ``biomarkers`` table and ``background`` list.

    Parameters
    ----------
    cluster : int
        Value matched against the ``cluster`` column of ``biomarkers``.
    func : callable
        Enrichment runner (e.g. ``run_fly`` or ``run_flyslim``). It is called as
        ``func(study_items, background, cutoff=cutoff, return_obj=True)``.
    max_p_val_adj : float, optional
        A biomarker row is kept when ``p_val_adj <= max_p_val_adj``.
        Defaults to 0.01, the threshold that used to be hard-coded.
    cutoff : float, optional
        Forwarded unchanged to ``func``. Defaults to 0.05, the value that
        used to be hard-coded.

    Returns
    -------
    object
        Whatever ``func`` returns; the callers in this notebook unpack it as
        ``(results, goeaobj)``.
    """
    # `@name` makes pandas look the values up in this function's locals rather
    # than having them spliced into the query text.
    cluster_genes = biomarkers.query(
        'cluster == @cluster & p_val_adj <= @max_p_val_adj'
    ).index.tolist()
    return func(cluster_genes, background, cutoff=cutoff, return_obj=True)

In [87]:
# GO-slim enrichment per cluster: run the test, round-trip the report through a
# temporary TSV to obtain a DataFrame, and tag every row with its cluster.
dfs = []
for cluster in range(12):  # NOTE(review): assumes clusters are numbered 0-11 — confirm against `biomarkers`
    results, goeaobj = analyze_cluster(cluster, run_flyslim)
    with NamedTemporaryFile() as tmp:
        # wr_tsv is handed a path, so write to a scratch file and read it back
        # before the context manager removes it.
        goeaobj.wr_tsv(tmp.name, results)
        df = pd.read_csv(tmp.name, sep='\t')
    # Non-mutating drop/assign instead of inplace=True, so the step is easy to re-run.
    df = df.drop('study_items', axis=1).assign(cluster=cluster)
    dfs.append(df)

go_results = pd.concat(dfs).set_index('# GO')
# Replace cluster numbers using the mapping in nbconfig.cluster_annot.
go_results['cluster'] = go_results['cluster'].replace(nbconfig.cluster_annot)

# One row per (term name, cluster) pair, indexed by term name. The pairs are read
# from the value_counts index rather than dropping the counts column by label:
# that column is named 'cluster' on older pandas but 'count' on pandas >= 2.0,
# where `.drop('cluster', axis=1)` raises KeyError. Row order is unchanged.
summary = (
    go_results.groupby('name')['cluster']
    .value_counts()
    .index.to_frame(index=False)
    .set_index('name')
)

go_results.to_csv('../output/2018-08-03_go_slim_analysis_results.tsv', sep='\t')
summary.to_csv('../output/2018-08-03_go_slim_analysis_summary.tsv', sep='\t')

fisher module not installed.  Falling back on scipy.stats.fisher_exact
11,166 out of 14,476 population items found in association
Calculating uncorrected p-values using fisher_scipy_stats
   565 out of    736 study items found in association
Running multitest correction: statsmodels fdr_bh
  113 GO terms are associated with 565 of 736 study items
  143 GO terms are associated with 11,166 of 14,476 population items
     37 items WROTE: /tmp/tmpu7is_6cf
fisher module not installed.  Falling back on scipy.stats.fisher_exact
11,166 out of 14,476 population items found in association
Calculating uncorrected p-values using fisher_scipy_stats
   310 out of    348 study items found in association
Running multitest correction: statsmodels fdr_bh
  115 GO terms are associated with 310 of 348 study items
  143 GO terms are associated with 11,166 of 14,476 population items
     30 items WROTE: /tmp/tmpb60_hxpt
fisher module not installed.  Falling back on scipy.stats.fisher_exact
11,166 out of 14,

In [90]:
# Full-GO enrichment per cluster: same procedure as the GO-slim run, but using
# run_fly. Note that this rebinds `dfs`, `go_results` and `summary`.
dfs = []
for cluster in range(12):  # NOTE(review): assumes clusters are numbered 0-11 — confirm against `biomarkers`
    results, goeaobj = analyze_cluster(cluster, run_fly)
    with NamedTemporaryFile() as tmp:
        # wr_tsv is handed a path, so write to a scratch file and read it back
        # before the context manager removes it.
        goeaobj.wr_tsv(tmp.name, results)
        df = pd.read_csv(tmp.name, sep='\t')
    # Non-mutating drop/assign instead of inplace=True, so the step is easy to re-run.
    df = df.drop('study_items', axis=1).assign(cluster=cluster)
    dfs.append(df)

go_results = pd.concat(dfs).set_index('# GO')
# Replace cluster numbers using the mapping in nbconfig.cluster_annot.
go_results['cluster'] = go_results['cluster'].replace(nbconfig.cluster_annot)

# One row per (term name, cluster) pair, indexed by term name. The pairs are read
# from the value_counts index rather than dropping the counts column by label:
# that column is named 'cluster' on older pandas but 'count' on pandas >= 2.0,
# where `.drop('cluster', axis=1)` raises KeyError. Row order is unchanged.
summary = (
    go_results.groupby('name')['cluster']
    .value_counts()
    .index.to_frame(index=False)
    .set_index('name')
)

go_results.to_csv('../output/2018-08-03_go_analysis_results.tsv', sep='\t')
summary.to_csv('../output/2018-08-03_go_analysis_summary.tsv', sep='\t')

fisher module not installed.  Falling back on scipy.stats.fisher_exact
11,166 out of 14,476 population items found in association
Calculating uncorrected p-values using fisher_scipy_stats
   565 out of    736 study items found in association
Running multitest correction: statsmodels fdr_bh
  682 GO terms are associated with 565 of 736 study items
  7,609 GO terms are associated with 11,166 of 14,476 population items
     27 items WROTE: /tmp/tmpief4eeuq
fisher module not installed.  Falling back on scipy.stats.fisher_exact
11,166 out of 14,476 population items found in association
Calculating uncorrected p-values using fisher_scipy_stats
   310 out of    348 study items found in association
Running multitest correction: statsmodels fdr_bh
  992 GO terms are associated with 310 of 348 study items
  7,609 GO terms are associated with 11,166 of 14,476 population items
     39 items WROTE: /tmp/tmpfjdz078w
fisher module not installed.  Falling back on scipy.stats.fisher_exact
11,166 out of

In [91]:
# Preview the first rows of `go_results`; at this point it holds the run_fly
# (full GO) table, which replaced the earlier GO-slim one.
go_results.head()

Unnamed: 0_level_0,NS,enrichment,name,ratio_in_study,ratio_in_pop,p_uncorrected,depth,study_count,p_fdr_bh,cluster
# GO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GO:0008150,BP,e,biological_process,252/736,1647/14476,1.5271529999999998e-64,0,252,1.1620110000000001e-60,Late 1º Spermatocytes (0)
GO:0006123,BP,e,"mitochondrial electron transport, cytochrome c...",8/736,15/14476,2.012627e-07,8,8,0.0002552346,Late 1º Spermatocytes (0)
GO:0006355,BP,p,"regulation of transcription, DNA-templated",0/736,283/14476,6.894252e-07,9,0,0.0006137969,Late 1º Spermatocytes (0)
GO:0035971,BP,e,peptidyl-histidine dephosphorylation,4/736,4/14476,6.630571e-06,8,4,0.00280289,Late 1º Spermatocytes (0)
GO:0007291,BP,e,sperm individualization,11/736,51/14476,4.011487e-05,4,11,0.01346114,Late 1º Spermatocytes (0)


In [92]:
# Preview the term-name -> cluster summary derived from `go_results`.
summary.head()

Unnamed: 0_level_0,cluster
name,Unnamed: 1_level_1
'de novo' protein folding,Spermatogonia (6)
"4 iron, 4 sulfur cluster binding",Unknown (9)
7S RNA binding,Spermatogonia (6)
ACF complex,Spermatogonia (6)
ATP binding,Late 1º Spermatocytes (0)


In [93]:
# Display the whole summary frame; a term name appears once for every cluster
# it is listed under, so index values repeat.
summary

Unnamed: 0_level_0,cluster
name,Unnamed: 1_level_1
'de novo' protein folding,Spermatogonia (6)
"4 iron, 4 sulfur cluster binding",Unknown (9)
7S RNA binding,Spermatogonia (6)
ACF complex,Spermatogonia (6)
ATP binding,Late 1º Spermatocytes (0)
ATP binding,Spermatogonia (6)
ATP binding,Unknown (11)
ATP hydrolysis coupled proton transport,Early Cyst Cells (5)
ATP hydrolysis coupled proton transport,Late Cyst Cells (4)
ATP hydrolysis coupled proton transport,Mid Cyst Cells (1)


In [95]:
# Shape of the de-duplicated index, i.e. how many distinct term names `summary` contains.
summary.index.unique().shape

(775,)