In [1]:
import pandas as pd
import os

In [3]:
pd.set_option('display.max_colwidth', None)

Estimated expression levels from RSEM as a tsv file. The columns are as follows:

column 1: gene_id - gene name of the gene the transcript belongs to (parent gene). If no gene information is provided, gene_id and transcript_id are the same.  
column 2: transcript_id(s) - transcript name of this transcript  
column 3: length - the transcript's sequence length (poly(A) tail is not counted)  
column 4: effective_length - the length containing only the positions that can generate a valid fragment  
column 5: expected_count - the sum, over all reads, of the posterior probability that each read comes from this transcript  
column 6: TPM - transcripts per million, a relative measure of transcript abundance  
column 7: FPKM - fragments per kilobase of transcript per million mapped reads, another relative measure of transcript abundance  
column 8: posterior_mean_count - posterior mean estimate calculated by RSEM's Gibbs sampler  
column 9: posterior_standard_deviation_of_count - posterior standard deviation of counts  
column 10: pme_TPM - posterior mean estimate of TPM  
column 11: pme_FPKM - posterior mean estimate of FPKM  
column 12: TPM_ci_lower_bound - lower bound of 95% credibility interval for TPM values  
column 13: TPM_ci_upper_bound - upper bound of 95% credibility interval for TPM values  
column 14: FPKM_ci_lower_bound - lower bound of 95% credibility interval for FPKM values  
column 15: FPKM_ci_upper_bound - upper bound of 95% credibility interval for FPKM values  

In [4]:
# load Mouse ENCODE E11.5 data
RSEM_DATA_DIR = "../BEHST/data"


def load_rsem(accession):
    """
    Read one RSEM quantification table by its ENCODE file accession.

    The file is expected at ``<RSEM_DATA_DIR>/<accession>.tsv`` and is parsed as a
    tab-separated table whose first line is the header.
    """
    return pd.read_csv(f"{RSEM_DATA_DIR}/{accession}.tsv", sep='\t', header=0)


# two replicates per tissue; the variable names are used throughout the rest of the notebook
limb_1 = load_rsem("ENCFF195JHC")
limb_2 = load_rsem("ENCFF457ZGF")
forebrain_1 = load_rsem("ENCFF465SNB")
forebrain_2 = load_rsem("ENCFF976OLT")
midbrain_1 = load_rsem("ENCFF359ZOA")
midbrain_2 = load_rsem("ENCFF971KZC")
hindbrain_1 = load_rsem("ENCFF750FTK")
hindbrain_2 = load_rsem("ENCFF109HTF")
heart_1 = load_rsem("ENCFF226IWR")
heart_2 = load_rsem("ENCFF540EJL")
liver_1 = load_rsem("ENCFF954EHG")
liver_2 = load_rsem("ENCFF523MEO")
neural_tube_1 = load_rsem("ENCFF375JDR")
neural_tube_2 = load_rsem("ENCFF298WHK")
facial_1 = load_rsem("ENCFF772UWT")
facial_2 = load_rsem("ENCFF262TXH")

In [5]:
# Ensembl human-mouse orthologous genes
gene_ids = pd.read_csv("../BEHST/data/human_mouse_gene_id.txt", sep='\t')
# rename the mouse ID column to 'gene_id' so it can serve as the merge key against the expression tables
gene_ids = gene_ids.rename(columns={'Mouse gene stable ID': 'gene_id'})

In [6]:
gene_ids.head()

Unnamed: 0,Gene stable ID,gene_id,Mouse gene name,Mouse homology type
0,ENSG00000198888,ENSMUSG00000064341,mt-Nd1,ortholog_one2one
1,ENSG00000198763,ENSMUSG00000064345,mt-Nd2,ortholog_one2one
2,ENSG00000198804,ENSMUSG00000064351,mt-Co1,ortholog_one2one
3,ENSG00000198712,ENSMUSG00000064354,mt-Co2,ortholog_one2one
4,ENSG00000228253,ENSMUSG00000064356,mt-Atp8,ortholog_one2one


#### Defining Tissue-specific Genes
Tissue-specific genes are defined using the algorithm from the HPA (Uhlén et al. 2015), and can be grouped as follows:

- Tissue Enriched: Genes with an expression level greater than 1 (TPM or FPKM) that also have at least five-fold higher expression levels in a particular tissue compared to all other tissues.
- Group Enriched: Genes with an expression level greater than 1 (TPM or FPKM) that also have at least five-fold higher expression levels in a group of 2-7 tissues compared to all other tissues, and that are not considered Tissue Enriched.
- Tissue Enhanced: Genes with an expression level greater than 1 (TPM or FPKM) that also have at least five-fold higher expression levels in a particular tissue compared to the average levels in all other tissues, and that are not considered Tissue Enriched or Group Enriched.



In [7]:
def calculate_mean(df_list):
    """
    Calculate the mean values of the isogenic replicates of gene quantifications.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Quantification tables to pool; each must contain a 'gene_id' column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'gene_id' (restored as a regular column) holding the
        mean of every numeric column over all rows pooled from ``df_list``.
        Non-numeric columns are not included.
    """
    df_all = pd.concat(df_list)
    # numeric_only=True: string columns (e.g. 'transcript_id(s)') cannot be averaged
    # and make a plain .mean() raise TypeError on pandas >= 2.0.
    df_mean = df_all.groupby('gene_id').mean(numeric_only=True)
    df_mean = df_mean.reset_index()

    # TODO: can drop unused columns here
    return df_mean

In [8]:
def select_genes(tissue_dfs, other_dfs, min_tpm=1, fold_change=5):
    """
    Select genes whose expression is enhanced in one tissue relative to the others.

    A gene is kept when its mean pme_TPM over ``tissue_dfs`` is greater than
    ``min_tpm`` and at least ``fold_change`` times its mean pme_TPM over ``other_dfs``.

    Parameters
    ----------
    tissue_dfs : list of pandas.DataFrame
        Replicate quantification tables of the tissue of interest.
    other_dfs : list of pandas.DataFrame
        Quantification tables of all tissues to compare against.
    min_tpm : float, optional
        Expression floor applied to the tissue mean (strictly greater than). Default 1.
    fold_change : float, optional
        Minimum ratio of tissue mean to other mean (inclusive). Default 5.

    Returns
    -------
    pandas.DataFrame
        Columns 'gene_id', 'tissue_TPM', 'other_TPM'; 'gene_id' has everything from
        the first '.' onwards removed.
    """
    tissue_mean = calculate_mean(tissue_dfs)
    other_mean = calculate_mean(other_dfs)

    # keep genes expressed above the floor in this tissue
    tissue_genes = tissue_mean[tissue_mean['pme_TPM'] > min_tpm]

    # inner merge: only genes present in both tables can be compared;
    # the shared 'pme_TPM' column receives the default _x (tissue) / _y (other) suffixes
    tissue_enhanced = pd.merge(tissue_genes, other_mean, on='gene_id')
    tissue_enhanced = tissue_enhanced.rename(columns={'pme_TPM_x': 'tissue_TPM', 'pme_TPM_y': 'other_TPM'})
    # select useful columns
    tissue_enhanced = tissue_enhanced[['gene_id', 'tissue_TPM', 'other_TPM']]

    # .copy() so the gene_id rewrite below modifies an independent frame, not a slice
    is_enhanced = tissue_enhanced['tissue_TPM'] >= (fold_change * tissue_enhanced['other_TPM'])
    df_enhanced = tissue_enhanced[is_enhanced].copy()

    # convert to mouse stable id (strip the '.<version>' suffix)
    df_enhanced['gene_id'] = df_enhanced['gene_id'].str.split('.').str.get(0)

    return df_enhanced

In [9]:
def merge_df(go_df, ref_df, keys, sig):
    """
    Compare an output GO term table against a reference (ground-truth) term table.

    Parameters
    ----------
    go_df : pandas.DataFrame
        Output GO terms to evaluate.
    ref_df : pandas.DataFrame
        Reference GO terms.
    keys : list of str
        Columns to merge on.
    sig : bool
        If true and ``go_df`` has a 'p_value' column, only rows of ``go_df`` with
        p_value <= 0.05 are evaluated. Without a 'p_value' column no filtering is done.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The first dataframe holds the true positive terms (rows of ``go_df`` that
        match ``ref_df`` on ``keys``), the second holds the false positive terms
        (rows of ``go_df`` with no match in ``ref_df``). Both carry the '_merge'
        indicator column. The two counts are also printed.
    """
    # select significant terms
    if sig and 'p_value' in go_df.columns:
        go_df = go_df[go_df['p_value'] <= 0.05]

    # left merge keeps every go_df row; '_merge' records whether a reference match exists
    res = go_df.merge(ref_df, on=keys, how='left', indicator=True)
    res_pos = res[res['_merge'] == 'both']
    res_neg = res[res['_merge'] == 'left_only']

    print("The number of true positive terms is", len(res_pos))
    print("The number of false positive terms is", len(res_neg))

    return res_pos, res_neg

In [10]:
def top_50(go_df, ref_df):
    """
    Compare the 50 terms of go_df with the smallest p-values against ref_df.

    The comparison is delegated to merge_df without significance filtering.
    Terms are matched on 'term_id', 'source' and 'term_name' when go_df provides
    both 'source' and 'term_name', otherwise on 'term_id' alone.
    Returns the (true positive, false positive) tuple produced by merge_df.
    """
    # the 50 rows with the smallest p-values
    best_terms = go_df.sort_values('p_value').head(50)

    # choose the merge key according to the columns available
    has_full_key = {'source', 'term_name'}.issubset(best_terms.columns)
    merge_keys = ['term_id', 'source', 'term_name'] if has_full_key else ['term_id']

    return merge_df(best_terms, ref_df, merge_keys, False)

## BEHST Limb

In [11]:
# select limb-enhanced genes: mean pme_TPM > 1 in the limb replicates and at least
# five times the mean pme_TPM over the replicates of all other tissues (see select_genes)
limb_df = select_genes([limb_1, limb_2], 
                       [forebrain_1, forebrain_2, midbrain_1, midbrain_2, hindbrain_1, hindbrain_2, 
                       heart_1, heart_2, liver_1, liver_2, neural_tube_1, neural_tube_2, facial_1, facial_2])

len(limb_df)

97

In [12]:
# select genes with mappable id for homo sapiens
mapped = pd.merge(limb_df, gene_ids, on='gene_id', how='inner')
len(mapped)

# mapped[['Gene stable ID']].to_csv('../BEHST/data/06_24_reference_limb_gene_id', header=None, index=None)

69

In [13]:
mapped[mapped['Mouse homology type'] == 'ortholog_one2many']

Unnamed: 0,gene_id,tissue_TPM,other_TPM,Gene stable ID,Mouse gene name,Mouse homology type
67,ENSMUSG00000084897,8.655,1.140714,ENSG00000242950,Gm14226,ortholog_one2many


In [14]:
gene_ids[gene_ids['gene_id'] == 'ENSMUSG00000097052'].head(50)

Unnamed: 0,Gene stable ID,gene_id,Mouse gene name,Mouse homology type
31,ENSG00000280496,ENSMUSG00000097052,Snora43,ortholog_many2many
722,ENSG00000252461,ENSMUSG00000097052,Snora43,ortholog_many2many


In [15]:
# run gprofiler2 on the mapped genes, get this GO list as reference
limb_ref = pd.read_csv("../BEHST/results/06-24/limb_go_list", sep="\t", header=0)
len(limb_ref)

2336

In [16]:
# select significant terms
limb_ref = limb_ref[limb_ref.p_value <= 0.05]
limb_ref

Unnamed: 0,p_value,term_id,source,term_name
0,1.520378e-26,GO:0060173,GO:BP,limb development
1,1.520378e-26,GO:0048736,GO:BP,appendage development
2,5.502065e-23,GO:0035107,GO:BP,appendage morphogenesis
3,5.502065e-23,GO:0035108,GO:BP,limb morphogenesis
4,1.153630e-22,GO:0035113,GO:BP,embryonic appendage morphogenesis
...,...,...,...,...
173,3.137960e-02,GO:0044271,GO:BP,cellular nitrogen compound biosynthetic process
174,3.174007e-02,GO:0045445,GO:BP,myoblast differentiation
175,3.226027e-02,GO:0072006,GO:BP,nephron development
176,3.943211e-02,GO:0003676,GO:MF,nucleic acid binding


In [17]:
# read BEHST limb output GO terms
# NOTE(review): absolute cluster path, while most other inputs are read relative to ../BEHST —
# confirm whether this file can be addressed relative to the notebook as well.
limb = pd.read_csv("/mnt/work1/users/hoffmangroup/zhiyuanl/BEHST/bin/BEHST-results/vista_LIMB_sorted_bed_gProfiler_results_QUERY28000_TARGET18100_revGO_term_list_rand1591227782.behst", 
                   sep='\t', header=0)

# keep significant BEHST terms (p_value <= 0.05) and split them into those present in / absent from the reference
limb_res = merge_df(limb, limb_ref, ['term_id', 'source', 'term_name'], True)
limb_res[0]

The number of true positive terms is 58
The number of false positive terms is 2


Unnamed: 0,p_value_x,term_id,source,term_name,p_value_y,_merge
0,1.150596e-10,GO:0001501,GO:BP,skeletal system development,1.423548e-19,both
1,2.388447e-09,GO:0048598,GO:BP,embryonic morphogenesis,1.207525e-15,both
2,5.034933e-08,GO:0048568,GO:BP,embryonic organ development,0.0001529934,both
3,1.89533e-07,GO:0048562,GO:BP,embryonic organ morphogenesis,2.089487e-06,both
4,3.258574e-07,GO:0003002,GO:BP,regionalization,1.752402e-11,both
5,1.15107e-06,GO:0009952,GO:BP,anterior/posterior pattern specification,5.286603e-08,both
6,1.785529e-06,GO:0009790,GO:BP,embryo development,3.135152e-12,both
7,1.796065e-06,GO:0048706,GO:BP,embryonic skeletal system development,2.814358e-09,both
8,2.147059e-06,GO:0000981,GO:MF,"DNA-binding transcription factor activity, RNA polymerase II-specific",1.388936e-15,both
9,2.480959e-06,GO:0007389,GO:BP,pattern specification process,1.859436e-13,both


In [18]:
limb_res[1]

Unnamed: 0,p_value_x,term_id,source,term_name,p_value_y,_merge
47,0.018272,GO:0002095,GO:CC,caveolar macromolecular signaling complex,,left_only
59,0.049652,GO:1902379,GO:MF,chemoattractant activity involved in axon guidance,,left_only


### Comparison with total shuffling and tss shuffling
#### limb total

In [19]:
limb_total = pd.read_csv("/mnt/work1/users/hoffmangroup/zhiyuanl/BEHST/results/vista_LIMB_SHUFFLED/gprofiler2_output/vista_LIMB_SHUFFLED_bed_QUERY28000_TARGET18100_gprofiler_output", 
                   sep='\t', header=0)

limb_total = limb_total[['p_value', 'term_id', 'source', 'term_name']]
limb_total = limb_total[limb_total['p_value'] <= 0.05]
len(limb_total)

30

In [20]:
# intersect with reference
limb_total_res = merge_df(limb_total, limb_ref, ['term_id', 'source', 'term_name'], True)

The number of true positive terms is 1
The number of false positive terms is 29


In [21]:
limb_total_res[0]

Unnamed: 0,p_value_x,term_id,source,term_name,p_value_y,_merge
10,0.002825,GO:0009888,GO:BP,tissue development,5.856012e-11,both


In [22]:
limb_total_res[1]

Unnamed: 0,p_value_x,term_id,source,term_name,p_value_y,_merge
0,6.026773e-08,GO:0070268,GO:BP,cornification,,left_only
1,1.333986e-06,GO:0031424,GO:BP,keratinization,,left_only
2,2.874554e-06,GO:0030216,GO:BP,keratinocyte differentiation,,left_only
3,5.313839e-05,GO:0009913,GO:BP,epidermal cell differentiation,,left_only
4,0.0003998853,GO:0010896,GO:BP,regulation of triglyceride catabolic process,,left_only
5,0.0005903601,GO:0043588,GO:BP,skin development,,left_only
6,0.0006546098,GO:0090207,GO:BP,regulation of triglyceride metabolic process,,left_only
7,0.0008311337,GO:0008544,GO:BP,epidermis development,,left_only
8,0.001687502,GO:0030855,GO:BP,epithelial cell differentiation,,left_only
9,0.002042889,GO:0010898,GO:BP,positive regulation of triglyceride catabolic process,,left_only


#### limb tss

In [23]:
limb_tss = pd.read_csv("/mnt/work1/users/hoffmangroup/zhiyuanl/BEHST/results/randomlyShuffledChromWide_vista_LIMB_sorted/gprofiler2_output/randomlyShuffledChromWide_vista_LIMB_sorted_bed_QUERY28000_TARGET18100_gprofiler_output", 
                   sep='\t', header=0)

limb_tss = limb_tss[['p_value', 'term_id', 'source', 'term_name']]
limb_tss = limb_tss[limb_tss['p_value'] <= 0.05]
len(limb_tss)

10

In [24]:
# intersect with reference
limb_tss_res = merge_df(limb_tss, limb_ref, ['term_id', 'source', 'term_name'], True)

The number of true positive terms is 0
The number of false positive terms is 10


In [25]:
limb_tss_res[1]

Unnamed: 0,p_value_x,term_id,source,term_name,p_value_y,_merge
0,0.035037,GO:0070098,GO:BP,chemokine-mediated signaling pathway,,left_only
1,2.6e-05,GO:0016493,GO:MF,C-C chemokine receptor activity,,left_only
2,3.5e-05,GO:0019957,GO:MF,C-C chemokine binding,,left_only
3,5.9e-05,GO:0001637,GO:MF,G protein-coupled chemoattractant receptor activity,,left_only
4,5.9e-05,GO:0004950,GO:MF,chemokine receptor activity,,left_only
5,0.00027,GO:0019956,GO:MF,chemokine binding,,left_only
6,0.032747,GO:0008528,GO:MF,G protein-coupled peptide receptor activity,,left_only
7,0.043529,GO:0001653,GO:MF,peptide receptor activity,,left_only
8,0.049614,GO:0035717,GO:MF,chemokine (C-C motif) ligand 7 binding,,left_only
9,0.049614,GO:0071791,GO:MF,chemokine (C-C motif) ligand 5 binding,,left_only


### Worst parameters

- From the summation matrix, the worst parameters are query extension = 100, target extension = 6100  
- From the VISTA limb data, the worst parameters are query extension = 3200, target extension = 3100
- From the VISTA limb data, the best parameters are query extension = 9400, target extension = 6100

### Query extension = 100, Target extension = 100

In [26]:
limb_100_100 = pd.read_csv("../BEHST/results/07-07/vista_LIMB_sorted_100_100_go_list", sep='\t', header=0)

In [27]:
limb_100_100_res = merge_df(limb_100_100, limb_ref, ['term_id', 'source', 'term_name'], True)

The number of true positive terms is 41
The number of false positive terms is 1


In [28]:
top_50_100_100 = top_50(limb_100_100, limb_ref)

The number of true positive terms is 47
The number of false positive terms is 3


### Query extension = 3200, Target extension = 3100

In [29]:
limb_3200_3100 = pd.read_csv("../BEHST/results/07-07/vista_LIMB_sorted_3200_3100_go_list", sep='\t', header=0)

In [30]:
limb_3200_3100_res = merge_df(limb_3200_3100, limb_ref, ['term_id', 'source', 'term_name'], True)

The number of true positive terms is 41
The number of false positive terms is 1


In [31]:
top_50_3200_3100 = top_50(limb_3200_3100, limb_ref)

The number of true positive terms is 49
The number of false positive terms is 1


### Query extension = 100, Target extension = 6100 (Worst Parameter)

In [32]:
limb_100_6100 = pd.read_csv("../BEHST/results/07-07/vista_LIMB_sorted_100_6100_go_list", sep='\t', header=0)

limb_100_6100_res = merge_df(limb_100_6100, limb_ref, ['term_id', 'source', 'term_name'], True)
limb_100_6100_res[0]

The number of true positive terms is 51
The number of false positive terms is 2


Unnamed: 0,p_value_x,term_id,source,term_name,p_value_y,_merge
0,4.196831e-09,GO:0048562,GO:BP,embryonic organ morphogenesis,2.089487e-06,both
1,2.048691e-08,GO:0048568,GO:BP,embryonic organ development,0.0001529934,both
2,6.303988e-08,GO:0048598,GO:BP,embryonic morphogenesis,1.207525e-15,both
3,8.141268e-08,GO:0048704,GO:BP,embryonic skeletal system morphogenesis,4.630908e-09,both
4,2.441219e-07,GO:0000981,GO:MF,"DNA-binding transcription factor activity, RNA polymerase II-specific",1.388936e-15,both
5,7.010824e-07,GO:0003700,GO:MF,DNA-binding transcription factor activity,7.006285e-14,both
6,8.146066e-07,GO:0048706,GO:BP,embryonic skeletal system development,2.814358e-09,both
7,2.507476e-06,GO:0009952,GO:BP,anterior/posterior pattern specification,5.286603e-08,both
8,2.866713e-06,GO:0001501,GO:BP,skeletal system development,1.423548e-19,both
9,4.550185e-06,GO:0140110,GO:MF,transcription regulator activity,9.129339e-11,both


In [33]:
limb_100_6100_res[1]

Unnamed: 0,p_value_x,term_id,source,term_name,p_value_y,_merge
46,0.029729,GO:0060065,GO:BP,uterus development,,left_only
48,0.036276,GO:0042101,GO:CC,T cell receptor complex,,left_only


In [34]:
top_50_100_6100 = top_50(limb_100_6100, limb_ref)

The number of true positive terms is 48
The number of false positive terms is 2


### Query extension = 9400, Target extension = 6100

In [35]:
limb_9400_6100 = pd.read_csv("../BEHST/results/07-07/vista_LIMB_sorted_9400_6100_go_list", sep='\t', header=0)

limb_9400_6100_res = merge_df(limb_9400_6100, limb_ref, ['term_id', 'source', 'term_name'], True)

The number of true positive terms is 75
The number of false positive terms is 2


In [36]:
top_50_9400_6100 = top_50(limb_9400_6100, limb_ref)

The number of true positive terms is 50
The number of false positive terms is 0


In [37]:
# check overlapped terms

limb_100_6100_pos = limb_100_6100_res[0][['p_value_x', 'term_id', 'source', 'term_name']]
limb_pos = limb_res[0][['p_value_x', 'term_id', 'source', 'term_name']]
merged = merge_df(limb_100_6100_pos, limb_pos, ['term_id', 'source', 'term_name'], True)
merged[0]

The number of true positive terms is 48
The number of false positive terms is 3


Unnamed: 0,p_value_x_x,term_id,source,term_name,p_value_x_y,_merge
0,4.196831e-09,GO:0048562,GO:BP,embryonic organ morphogenesis,1.89533e-07,both
1,2.048691e-08,GO:0048568,GO:BP,embryonic organ development,5.034933e-08,both
2,6.303988e-08,GO:0048598,GO:BP,embryonic morphogenesis,2.388447e-09,both
3,8.141268e-08,GO:0048704,GO:BP,embryonic skeletal system morphogenesis,1.740721e-05,both
4,2.441219e-07,GO:0000981,GO:MF,"DNA-binding transcription factor activity, RNA polymerase II-specific",2.147059e-06,both
5,7.010824e-07,GO:0003700,GO:MF,DNA-binding transcription factor activity,3.346897e-05,both
6,8.146066e-07,GO:0048706,GO:BP,embryonic skeletal system development,1.796065e-06,both
7,2.507476e-06,GO:0009952,GO:BP,anterior/posterior pattern specification,1.15107e-06,both
8,2.866713e-06,GO:0001501,GO:BP,skeletal system development,1.150596e-10,both
9,4.550185e-06,GO:0140110,GO:MF,transcription regulator activity,0.001515828,both


In [38]:
merged[1]

Unnamed: 0,p_value_x_x,term_id,source,term_name,p_value_x_y,_merge
28,0.001082,GO:0003677,GO:MF,DNA binding,,left_only
45,0.0249,GO:0043009,GO:BP,chordate embryonic development,,left_only
50,0.045538,GO:0006355,GO:BP,"regulation of transcription, DNA-templated",,left_only


### Conclusion
- Optimized parameters return a larger number of significant GO terms. The number of false positive terms is similar across queries using different parameters, so the precision using optimized parameters is higher.

## Reference GO list from all tissues

In [39]:
# mean quantification per gene over every replicate of every tissue
all_ave = calculate_mean([limb_1, limb_2, forebrain_1, forebrain_2, midbrain_1, midbrain_2,
                          hindbrain_1, hindbrain_2,
                          heart_1, heart_2, liver_1, liver_2, neural_tube_1, neural_tube_2, facial_1, facial_2])

# .copy() makes the filtered frame independent, so the gene_id assignment below
# does not write into a slice of the unfiltered frame (SettingWithCopyWarning)
all_ave = all_ave[all_ave.pme_TPM >= 1].copy()
# strip the '.<version>' suffix from the gene ids
all_ave['gene_id'] = all_ave['gene_id'].str.split('.').str.get(0)

# number of expressed genes
len(all_ave)

18704

In [40]:
all_mapped = pd.merge(all_ave, gene_ids, on='gene_id', how='inner')

# number of mapped genes
len(all_mapped)

14897

In [41]:
# all_mapped[['Gene stable ID']].to_csv('../BEHST/data/06_24_reference_gene_id_all', header=None, index=None)

In [42]:
all_ref = pd.read_csv("../BEHST/results/06-24/go_list_all", sep="\t", header=0)
all_ref = all_ref[all_ref.p_value <= 0.05]
all_ref

Unnamed: 0,p_value,term_id,source,term_name
0,4.940656e-324,GO:0043231,GO:CC,intracellular membrane-bounded organelle
1,4.940656e-324,GO:0043227,GO:CC,membrane-bounded organelle
2,4.940656e-324,GO:0005622,GO:CC,intracellular
3,4.940656e-324,GO:0005737,GO:CC,cytoplasm
4,4.940656e-324,GO:0043226,GO:CC,organelle
...,...,...,...,...
1913,4.925656e-02,GO:0006904,GO:BP,vesicle docking involved in exocytosis
1914,4.925656e-02,GO:0051985,GO:BP,negative regulation of chromosome segregation
1915,4.925656e-02,GO:0021575,GO:BP,hindbrain morphogenesis
1916,4.925656e-02,GO:0051293,GO:BP,establishment of spindle localization


## GREAT: intersect with significant terms in reference list

#### result downloaded from GREAT web query
Test set:    
vista_LIMB_sorted_EDITED_FOR_GREAT.bed (243 genomic regions)  

Background:  
Whole genome background

Assembly:  
Human: GRCh37 (UCSC hg19, Feb. 2009)          

Associated genomic regions:  
Basal+extension (constitutive 5.0 kb upstream and 1.0 kb downstream, up to 1000.0 kb max extension). Curated regulatory domains are included.  
6 of all 243 genomic regions (2.5%) are not associated with any genes. 

Statistical significance: FDR  
threshold: 0.05  
View: significant by both

### FDR

In [43]:
# Load the GREAT result table; header=1 takes the second line of the file as the header row.
limb_great = pd.read_csv("/mnt/work1/users/hoffmangroup/zhiyuanl/BEHST/results/06-12/06-12-shown-MultipleOntologies.tsv", sep='\t', header=1)
# reset_index() turns the index into a regular column named 'index', which is selected below
limb_great = limb_great.reset_index()
#limb_great = limb_great[['index', '# Ontology', ' Term Name ', ' Binom Raw P-Value ', '  Binom FDR Q-Val  ', 
#                         '  Hyper FDR Q-Val  ']]
# NOTE(review): the six selected columns are relabelled positionally, so e.g. the column the file
# calls ' Term Name ' becomes 'term_id' and ' Binom Rank ' becomes 'binom_raw_p_value'.
# Presumably the file's header is shifted relative to its data columns — confirm against the raw file.
limb_great = limb_great[['index', '# Ontology', ' Term Name ', ' Binom Rank ',
                         ' Binom Raw P-Value ', ' Hyper Rank ']]
limb_great.columns = ['ontology', 'term name', 'term_id', 'binom_raw_p_value', 'binom_FDR_q_value',
                      'hyper_FDR_q_value']

In [44]:
great_res = merge_df(limb_great[['ontology', 'term_id', 'binom_raw_p_value', 'term name']], limb_ref, ['term_id'], 
                     True)
great_res[0][['term_id', 'binom_raw_p_value', 'term_name', 'p_value', 'source', '_merge']]

The number of true positive terms is 8
The number of false positive terms is 18


Unnamed: 0,term_id,binom_raw_p_value,term_name,p_value,source,_merge
8,GO:0035295,3.963168e-10,tube development,0.001142134,GO:BP,both
11,GO:0048598,8.090736e-10,embryonic morphogenesis,1.207525e-15,GO:BP,both
14,GO:0048562,1.067187e-08,embryonic organ morphogenesis,2.089487e-06,GO:BP,both
15,GO:0048568,1.181012e-08,embryonic organ development,0.0001529934,GO:BP,both
17,GO:0009790,2.616653e-08,embryo development,3.135152e-12,GO:BP,both
19,GO:0035239,5.610827e-08,tube morphogenesis,0.01439291,GO:BP,both
20,GO:0003700,1.68947e-09,DNA-binding transcription factor activity,7.006285e-14,GO:MF,both
21,GO:0000981,1.177067e-07,"DNA-binding transcription factor activity, RNA polymerase II-specific",1.388936e-15,GO:MF,both


In [45]:
great_res[1][['ontology', 'term_id', 'binom_raw_p_value', 'term name', '_merge']]

Unnamed: 0,ontology,term_id,binom_raw_p_value,term name,_merge
0,GO Biological Process,GO:0010558,1.969e-11,negative regulation of macromolecule biosynthetic process,left_only
1,GO Biological Process,GO:0031327,3.650388e-11,negative regulation of cellular biosynthetic process,left_only
2,GO Biological Process,GO:2000113,5.260388e-11,negative regulation of cellular macromolecule biosynthetic process,left_only
3,GO Biological Process,GO:0009890,5.767215e-11,negative regulation of biosynthetic process,left_only
4,GO Biological Process,GO:1903507,7.676659e-11,negative regulation of nucleic acid-templated transcription,left_only
5,GO Biological Process,GO:1902679,1.048514e-10,negative regulation of RNA biosynthetic process,left_only
6,GO Biological Process,GO:0045892,2.447997e-10,"negative regulation of transcription, DNA-templated",left_only
7,GO Biological Process,GO:0051172,3.208091e-10,negative regulation of nitrogen compound metabolic process,left_only
9,GO Biological Process,GO:0010629,4.168727e-10,negative regulation of gene expression,left_only
10,GO Biological Process,GO:0051253,5.461104e-10,negative regulation of RNA metabolic process,left_only


### Top 50 terms

In [46]:
great_all = pd.read_csv("../BEHST/data/limb_GREAT_res_all", sep='\t', header=0)

In [47]:
great_all_top = great_all.sort_values('BinomP').head(50)
great_all_top = great_all_top[['ID']]
great_all_top.columns = ['term_id']

In [48]:
great_all_res = merge_df(great_all_top, limb_ref, ['term_id'], True)

The number of true positive terms is 33
The number of false positive terms is 17


## GREAT-gprofiler Hybrid test

In [49]:
# Flatten the comma-separated gene lists stored in the column at position 9 of the
# GREAT table into one list (duplicates kept), in a single pass over that column.
great_gene_list = [
    gene
    for gene_field in great_all.iloc[:, 9]
    for gene in gene_field.split(',')
]

In [50]:
great_gene_set = set(great_gene_list)

In [51]:
# sorted(): a set has no stable iteration order across runs, so sort the gene names
# to make the row order (and any file written from it) reproducible
great_gene_df = pd.DataFrame(sorted(great_gene_set))
# great_gene_df.to_csv('../BEHST/data/great_limb_gene_id', header=None, index=None)

In [52]:
great_gprofiler = pd.read_csv("../BEHST/results/06-25/limb_great_go_list", sep='\t', header=0)
great_gprofiler_sig = great_gprofiler[great_gprofiler.p_value <= 0.05]
len(great_gprofiler_sig)

320

### FDR

In [53]:
great_gprofiler_res = merge_df(great_gprofiler_sig, limb_ref, ['term_id', 'source', 'term_name'], True)

The number of true positive terms is 139
The number of false positive terms is 181


In [54]:
great_gprofiler_res[0]

Unnamed: 0,p_value_x,term_id,source,term_name,p_value_y,_merge
0,2.736283e-17,GO:0000981,GO:MF,"DNA-binding transcription factor activity, RNA polymerase II-specific",1.388936e-15,both
1,1.811661e-15,GO:0003700,GO:MF,DNA-binding transcription factor activity,7.006285e-14,both
2,7.498709e-14,GO:0140110,GO:MF,transcription regulator activity,9.129339e-11,both
3,8.925696e-14,GO:0006357,GO:BP,regulation of transcription by RNA polymerase II,1.969506e-09,both
4,7.054216e-12,GO:0006366,GO:BP,transcription by RNA polymerase II,1.246135e-08,both
...,...,...,...,...,...,...
281,2.281995e-02,GO:0009893,GO:BP,positive regulation of metabolic process,3.021835e-04,both
294,2.790377e-02,GO:0048522,GO:BP,positive regulation of cellular process,5.876816e-03,both
295,2.828129e-02,GO:0007517,GO:BP,muscle organ development,5.580137e-06,both
296,2.890273e-02,GO:0002062,GO:BP,chondrocyte differentiation,3.836478e-05,both


In [55]:
great_gprofiler_res[1]

Unnamed: 0,p_value_x,term_id,source,term_name,p_value_y,_merge
10,2.667935e-09,GO:0021537,GO:BP,telencephalon development,,left_only
12,4.380492e-09,GO:0048699,GO:BP,generation of neurons,,left_only
18,8.161508e-09,GO:0007399,GO:BP,nervous system development,,left_only
20,9.398389e-09,GO:0030900,GO:BP,forebrain development,,left_only
22,1.212050e-08,GO:0007507,GO:BP,heart development,,left_only
...,...,...,...,...,...,...
314,4.505809e-02,GO:0035270,GO:BP,endocrine system development,,left_only
315,4.558823e-02,GO:0072087,GO:BP,renal vesicle development,,left_only
316,4.681536e-02,GO:0061311,GO:BP,cell surface receptor signaling pathway involved in heart development,,left_only
317,4.681536e-02,GO:0003338,GO:BP,metanephros morphogenesis,,left_only


### Top 50 terms

In [56]:
great_top_50 = great_gprofiler_sig.sort_values('p_value').head(50)

great_top_res = merge_df(great_top_50, limb_ref, ['term_id', 'source', 'term_name'], True)

The number of true positive terms is 39
The number of false positive terms is 11


## ChIP-Enrich

In [57]:
limb_chip = pd.read_csv("/mnt/work1/users/hoffmangroup/zhiyuanl/BEHST/results/07-02/vista_LIMB_sorted_chip_enrich_peaks.tab", sep='\t', header=0)

limb_chip.head()

Unnamed: 0,peak_id,chr,peak_start,peak_end,gene_id,gene_symbol,gene_locus_start,gene_locus_end,nearest_tss,dist_to_tss,nearest_tss_gene_id,nearest_tss_symbol,nearest_tss_gene_strand
0,peak:1,chr1,8130440,8131887,54206,ERRFI1,8048734,8235391,8086393,-44769,54206,ERRFI1,-
1,peak:2,chr1,44989825,44991149,100847089,MIR5584,44950331,45054434,45011165,-20677,100847089,MIR5584,+
2,peak:3,chr1,51165196,51166786,11124,FAF1,50983645,51429772,51078170,-87820,11124,FAF1,-
3,peak:4,chr1,54925047,54928826,23648,SSBP3,54768955,54939999,54872068,-54867,23648,SSBP3,-
4,peak:5,chr1,56240105,56242435,100616272,MIR4422,55686177,56368285,55691314,549955,100616272,MIR4422,+


In [58]:
# limb_chip[['gene_symbol']].to_csv('../BEHST/data/07_02_limb_chip_enrich_genes', header=None, index=None)

In [59]:
limb_chip_go = pd.read_csv("/mnt/work1/users/hoffmangroup/zhiyuanl/BEHST/results/07-02/vista_LIMB_sorted_chip_enrich_results.tab", sep='\t', header=0)

In [60]:
limb_chip_go = limb_chip_go[['Geneset.Type', 'Geneset.ID', 'Description', 'P.value']]
limb_chip_go.columns = ['source', 'term_id', 'term_name', 'p_value']
limb_chip_go = limb_chip_go[limb_chip_go.p_value <= 0.05]
limb_chip_go

Unnamed: 0,source,term_id,term_name,p_value
0,Gene Ontology Biological Process,GO:0071470,cellular response to osmotic stress,2.282953e-07
1,Gene Ontology Biological Process,GO:0009952,anterior/posterior pattern specification,1.573778e-06
2,Gene Ontology Biological Process,GO:0055075,potassium ion homeostasis,2.790453e-06
3,Gene Ontology Biological Process,GO:0021532,neural tube patterning,3.951001e-06
4,Gene Ontology Biological Process,GO:0035295,tube development,1.201910e-05
...,...,...,...,...
2652,Gene Ontology Molecular Function,GO:0005102,receptor binding,3.083794e-02
2653,Gene Ontology Molecular Function,GO:0035639,purine ribonucleoside triphosphate binding,3.276439e-02
2654,Gene Ontology Molecular Function,GO:0030554,adenyl nucleotide binding,3.851528e-02
2655,Gene Ontology Molecular Function,GO:0032559,adenyl ribonucleotide binding,3.915167e-02


In [61]:
chip_res = merge_df(limb_chip_go[['term_id', 'p_value']], limb_ref, ['term_id'], True)

The number of true positive terms is 67
The number of false positive terms is 441


In [62]:
chip_res[1]

Unnamed: 0,term_id,p_value_x,p_value_y,source,term_name,_merge
0,GO:0071470,2.282953e-07,,,,left_only
2,GO:0055075,2.790453e-06,,,,left_only
3,GO:0021532,3.951001e-06,,,,left_only
5,GO:0030315,1.350352e-05,,,,left_only
6,GO:1902188,1.378139e-05,,,,left_only
...,...,...,...,...,...,...
503,GO:0005102,3.083794e-02,,,,left_only
504,GO:0035639,3.276439e-02,,,,left_only
505,GO:0030554,3.851528e-02,,,,left_only
506,GO:0032559,3.915167e-02,,,,left_only


In [63]:
# Rank by p-value explicitly instead of relying on the row order of the input file;
# the stable sort keeps the file order among tied p-values.
limb_chip_top = limb_chip_go[['term_id', 'p_value']].sort_values('p_value', kind='mergesort').head(50)
limb_chip_top_res = merge_df(limb_chip_top, limb_ref, ['term_id'], True)
limb_chip_top_res[0]

The number of true positive terms is 10
The number of false positive terms is 40


Unnamed: 0,term_id,p_value_x,p_value_y,source,term_name,_merge
1,GO:0009952,2e-06,5.286603e-08,GO:BP,anterior/posterior pattern specification,both
4,GO:0035295,1.2e-05,0.001142134,GO:BP,tube development,both
10,GO:0048568,1.7e-05,0.0001529934,GO:BP,embryonic organ development,both
13,GO:0048562,3.8e-05,2.089487e-06,GO:BP,embryonic organ morphogenesis,both
19,GO:0003002,7.7e-05,1.752402e-11,GO:BP,regionalization,both
24,GO:0048598,9.1e-05,1.207525e-15,GO:BP,embryonic morphogenesis,both
25,GO:0035136,9.5e-05,3.972126e-09,GO:BP,forelimb morphogenesis,both
34,GO:0051254,0.000168,1.011132e-05,GO:BP,positive regulation of RNA metabolic process,both
44,GO:0043565,0.000229,7.987161e-13,GO:MF,sequence-specific DNA binding,both
48,GO:0003700,0.000257,7.006285e-14,GO:MF,DNA-binding transcription factor activity,both


## ChIP-Enrich-gprofiler Hybrid test

In [64]:
limb_chip_gprofiler = pd.read_csv("/mnt/work1/users/hoffmangroup/zhiyuanl/BEHST/results/07-02/limb_chip_go_list", sep='\t', header=0)

In [65]:
limb_chip_gprofiler_res = merge_df(limb_chip_gprofiler, limb_ref, ['term_id', 'source', 'term_name'], True)
limb_chip_gprofiler_res[0]

The number of true positive terms is 52
The number of false positive terms is 28


Unnamed: 0,p_value_x,term_id,source,term_name,p_value_y,_merge
0,2.297519e-07,GO:0000981,GO:MF,"DNA-binding transcription factor activity, RNA polymerase II-specific",1.388936e-15,both
1,1.688737e-06,GO:0003700,GO:MF,DNA-binding transcription factor activity,7.006285e-14,both
2,1.796098e-05,GO:0140110,GO:MF,transcription regulator activity,9.129339e-11,both
3,2.513041e-05,GO:0006357,GO:BP,regulation of transcription by RNA polymerase II,1.969506e-09,both
5,0.0001070172,GO:0000977,GO:MF,RNA polymerase II regulatory region sequence-specific DNA binding,1.614064e-10,both
7,0.0001252904,GO:0001012,GO:MF,RNA polymerase II regulatory region DNA binding,1.905652e-10,both
9,0.0001984396,GO:0006366,GO:BP,transcription by RNA polymerase II,1.246135e-08,both
10,0.0003240722,GO:0000976,GO:MF,transcription regulatory region sequence-specific DNA binding,5.224864e-10,both
11,0.0003428032,GO:0000790,GO:CC,nuclear chromatin,1.702319e-14,both
12,0.0004104401,GO:0043565,GO:MF,sequence-specific DNA binding,7.987161e-13,both


In [66]:
# Second element of the merge_df result; the output below shows `left_only` rows
# with no reference p-value (presumably the false-positive terms - confirm
# against merge_df, which is defined elsewhere in the notebook).
limb_chip_gprofiler_res[1]

Unnamed: 0,p_value_x,term_id,source,term_name,p_value_y,_merge
4,6.9e-05,GO:0042383,GO:CC,sarcolemma,,left_only
6,0.000115,GO:0007399,GO:BP,nervous system development,,left_only
8,0.000176,GO:0090596,GO:BP,sensory organ morphogenesis,,left_only
13,0.000463,GO:0048468,GO:BP,cell development,,left_only
21,0.002096,GO:0007507,GO:BP,heart development,,left_only
23,0.00265,GO:0030182,GO:BP,neuron differentiation,,left_only
25,0.003272,GO:0048699,GO:BP,generation of neurons,,left_only
29,0.004779,GO:0022008,GO:BP,neurogenesis,,left_only
35,0.006113,GO:1903507,GO:BP,negative regulation of nucleic acid-templated transcription,,left_only
36,0.006298,GO:1902679,GO:BP,negative regulation of RNA biosynthetic process,,left_only


In [67]:
# Repeat the comparison with only the first 50 rows of the
# ChIP-Enrich-gprofiler GO-term list.
limb_chip_gp_top = limb_chip_gprofiler.iloc[:50]
limb_chip_gp_res = merge_df(
    limb_chip_gp_top,
    limb_ref,
    ["term_id", "term_name", "source"],
    False,
)

# First element of the result; the output below shows rows flagged `both`.
limb_chip_gp_res[0]

The number of true positive terms is 36
The number of false positive terms is 14


Unnamed: 0,p_value_x,term_id,source,term_name,p_value_y,_merge
0,2.297519e-07,GO:0000981,GO:MF,"DNA-binding transcription factor activity, RNA polymerase II-specific",1.388936e-15,both
1,1.688737e-06,GO:0003700,GO:MF,DNA-binding transcription factor activity,7.006285e-14,both
2,1.796098e-05,GO:0140110,GO:MF,transcription regulator activity,9.129339e-11,both
3,2.513041e-05,GO:0006357,GO:BP,regulation of transcription by RNA polymerase II,1.969506e-09,both
5,0.0001070172,GO:0000977,GO:MF,RNA polymerase II regulatory region sequence-specific DNA binding,1.614064e-10,both
7,0.0001252904,GO:0001012,GO:MF,RNA polymerase II regulatory region DNA binding,1.905652e-10,both
9,0.0001984396,GO:0006366,GO:BP,transcription by RNA polymerase II,1.246135e-08,both
10,0.0003240722,GO:0000976,GO:MF,transcription regulatory region sequence-specific DNA binding,5.224864e-10,both
11,0.0003428032,GO:0000790,GO:CC,nuclear chromatin,1.702319e-14,both
12,0.0004104401,GO:0043565,GO:MF,sequence-specific DNA binding,7.987161e-13,both


In [68]:
# Second element of the top-50 merge_df result; the output below shows
# `left_only` rows with no reference p-value (presumably the false-positive
# terms - confirm against merge_df, which is defined elsewhere in the notebook).
limb_chip_gp_res[1]

Unnamed: 0,p_value_x,term_id,source,term_name,p_value_y,_merge
4,6.9e-05,GO:0042383,GO:CC,sarcolemma,,left_only
6,0.000115,GO:0007399,GO:BP,nervous system development,,left_only
8,0.000176,GO:0090596,GO:BP,sensory organ morphogenesis,,left_only
13,0.000463,GO:0048468,GO:BP,cell development,,left_only
21,0.002096,GO:0007507,GO:BP,heart development,,left_only
23,0.00265,GO:0030182,GO:BP,neuron differentiation,,left_only
25,0.003272,GO:0048699,GO:BP,generation of neurons,,left_only
29,0.004779,GO:0022008,GO:BP,neurogenesis,,left_only
35,0.006113,GO:1903507,GO:BP,negative regulation of nucleic acid-templated transcription,,left_only
36,0.006298,GO:1902679,GO:BP,negative regulation of RNA biosynthetic process,,left_only


## Result
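### False Discovery Rate
FDR here is False Positive / significant GO terms, i.e. the fraction of an experiment's significant GO terms that are not matched in the reference list.
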
experiment | significant GO terms | True Positive | False Positive | FDR 
--- | --- | --- | --- | --- 
BEHST | 60 | 58 | 2 | 0.033
GREAT | 26 | 8 | 18 | 0.692
GREAT-gprofiler | 320 | 139 | 181 | 0.566
ChIP-Enrich | 508 | 67 | 441 | 0.868
ChIP-Enrich-gprofiler | 80 | 52 | 28 | 0.350

### Top 50 terms
experiment | Intersection with reference | Intersection Rate
--- | --- | --- 
BEHST | 49 | 0.98
GREAT | 33 | 0.66
GREAT-gprofiler | 39 | 0.78
ChIP-Enrich | 10 | 0.20
ChIP-Enrich-gprofiler | 36 | 0.72

## DNase Hypersensitive Sites

In [97]:
# Load the gene-quantification tables used in the DNase hypersensitive sites
# section: two replicate TSV files per cell line, first row used as the header.

# GM12878
GM12878_1 = pd.read_csv("../BEHST/data/GM12878_gene_quantification_1.tsv", sep="\t", header=0)
GM12878_2 = pd.read_csv("../BEHST/data/GM12878_gene_quantification_2.tsv", sep="\t", header=0)

# H1
H1_1 = pd.read_csv("../BEHST/data/H1_gene_quantification_1.tsv", sep="\t", header=0)
H1_2 = pd.read_csv("../BEHST/data/H1_gene_quantification_2.tsv", sep="\t", header=0)

# HeLa-S3
HelaS3_1 = pd.read_csv("../BEHST/data/HelaS3_gene_quantification_1.tsv", sep="\t", header=0)
HelaS3_2 = pd.read_csv("../BEHST/data/HelaS3_gene_quantification_2.tsv", sep="\t", header=0)

# HepG2
HepG2_1 = pd.read_csv("../BEHST/data/HepG2_gene_quantification_1.tsv", sep="\t", header=0)
HepG2_2 = pd.read_csv("../BEHST/data/HepG2_gene_quantification_2.tsv", sep="\t", header=0)

# HUVEC
huvec_1 = pd.read_csv("../BEHST/data/huvec_gene_quantification_1.tsv", sep="\t", header=0)
huvec_2 = pd.read_csv("../BEHST/data/huvec_gene_quantification_2.tsv", sep="\t", header=0)

# K562
K562_1 = pd.read_csv("../BEHST/data/K562_gene_quantification_1.tsv", sep="\t", header=0)
K562_2 = pd.read_csv("../BEHST/data/K562_gene_quantification_2.tsv", sep="\t", header=0)

In [98]:
# Select the most highly expressed genes in GM12878: the two GM12878 replicates
# are passed first, the replicates of the other five cell lines second.
gm12878_df = select_genes(
    [GM12878_1, GM12878_2],
    [
        H1_1, H1_2,
        HelaS3_1, HelaS3_2,
        HepG2_1, HepG2_2,
        huvec_1, huvec_2,
        K562_1, K562_2,
    ],
)

# Disabled export of the selected gene ids, kept for provenance.
# gm12878_df[['gene_id']].to_csv('../BEHST/data/reference_gm12878_gene_id', header=None, index=None)

# Number of selected genes.
len(gm12878_df)

1659

In [99]:
# Reference GO terms for GM12878: read the list and keep only the rows whose
# p-value is at most 0.05.
gm12878_ref = (
    pd.read_csv("../BEHST/results/07-08/gm12878_go_list", sep="\t", header=0)
    .loc[lambda go_terms: go_terms["p_value"] <= 0.05]
)

gm12878_ref

Unnamed: 0,p_value,term_id,source,term_name
0,3.155919e-88,GO:0006955,GO:BP,immune response
1,3.627306e-88,GO:0002376,GO:BP,immune system process
2,1.188706e-61,GO:0002682,GO:BP,regulation of immune system process
3,2.447031e-57,GO:0046649,GO:BP,lymphocyte activation
4,1.082126e-55,GO:0002684,GO:BP,positive regulation of immune system process
...,...,...,...,...
508,4.341709e-02,GO:0061756,GO:BP,leukocyte adhesion to vascular endothelial cell
509,4.577894e-02,GO:0050710,GO:BP,negative regulation of cytokine secretion
510,4.653474e-02,GO:0003924,GO:MF,GTPase activity
511,4.688218e-02,GO:0046425,GO:BP,regulation of receptor signaling pathway via JAK-STAT


In [100]:
# GO-term lists for the two GM12878 DNase-seq replicates (result files with the
# `8_types` suffix), each compared against the GM12878 reference terms.
# Each merge_df call prints its true/false positive counts.

# Replicate 1
gm12878_1 = pd.read_csv(
    "../BEHST/results/07-08/GM12878_dnase_seq_hg19_1_bed_bed_QUERY28000_TARGET18100_8_types_go_list.txt",
    sep="\t",
    header=0,
)
gm12878_1_res = merge_df(gm12878_1, gm12878_ref, ["term_id", "source", "term_name"], True)

# Replicate 2
gm12878_2 = pd.read_csv(
    "../BEHST/results/07-08/GM12878_dnase_seq_hg19_2_bed_bed_QUERY28000_TARGET18100_8_types_go_list.txt",
    sep="\t",
    header=0,
)
gm12878_2_res = merge_df(gm12878_2, gm12878_ref, ["term_id", "source", "term_name"], True)

The number of true positive terms is 84
The number of false positive terms is 363
The number of true positive terms is 90
The number of false positive terms is 376


In [101]:
# GO-term lists for the two GM12878 DNase-seq replicates from the result files
# without the `8_types` suffix, each compared against the GM12878 reference terms.
# Each merge_df call prints its true/false positive counts.

# Replicate 1
gm12878_1_hic1 = pd.read_csv(
    "../BEHST/results/07-08/GM12878_dnase_seq_hg19_1_bed_QUERY28000_TARGET18100_go_list.txt",
    sep="\t",
    header=0,
)
gm12878_1_hic1_res = merge_df(gm12878_1_hic1, gm12878_ref, ["term_id", "source", "term_name"], True)

# Replicate 2
gm12878_2_hic1 = pd.read_csv(
    "../BEHST/results/07-08/GM12878_dnase_seq_hg19_2_bed_QUERY28000_TARGET18100_go_list.txt",
    sep="\t",
    header=0,
)
gm12878_2_hic1_res = merge_df(gm12878_2_hic1, gm12878_ref, ["term_id", "source", "term_name"], True)

The number of true positive terms is 62
The number of false positive terms is 170
The number of true positive terms is 61
The number of false positive terms is 165


In [102]:
# GO-term lists for the two "subtracted" GM12878 replicates (result files with
# the `8_types` suffix), each compared against the GM12878 reference terms.
# Each merge_df call prints its true/false positive counts.

# Subtracted replicate 1
gm12878_sub_1 = pd.read_csv(
    "../BEHST/results/07-08/GM12878_subtracted_1_bed_QUERY28000_TARGET18100_8_types_go_list.txt",
    sep="\t",
    header=0,
)
gm12878_sub_1_res = merge_df(gm12878_sub_1, gm12878_ref, ["term_id", "source", "term_name"], True)

# Subtracted replicate 2
gm12878_sub_2 = pd.read_csv(
    "../BEHST/results/07-08/GM12878_subtracted_2_bed_QUERY28000_TARGET18100_8_types_go_list.txt",
    sep="\t",
    header=0,
)
gm12878_sub_2_res = merge_df(gm12878_sub_2, gm12878_ref, ["term_id", "source", "term_name"], True)

The number of true positive terms is 110
The number of false positive terms is 156
The number of true positive terms is 143
The number of false positive terms is 180


In [103]:
# GO-term lists for the two "subtracted" GM12878 replicates from the result
# files without the `8_types` suffix, each compared against the GM12878
# reference terms. Each merge_df call prints its true/false positive counts.

# Subtracted replicate 1
gm12878_sub_1_hic1 = pd.read_csv(
    "../BEHST/results/07-08/GM12878_subtracted_1_bed_QUERY28000_TARGET18100_go_list.txt",
    sep="\t",
    header=0,
)
gm12878_sub_1_hic1_res = merge_df(gm12878_sub_1_hic1, gm12878_ref, ["term_id", "source", "term_name"], True)

# Subtracted replicate 2
gm12878_sub_2_hic1 = pd.read_csv(
    "../BEHST/results/07-08/GM12878_subtracted_2_bed_QUERY28000_TARGET18100_go_list.txt",
    sep="\t",
    header=0,
)
gm12878_sub_2_hic1_res = merge_df(gm12878_sub_2_hic1, gm12878_ref, ["term_id", "source", "term_name"], True)

The number of true positive terms is 54
The number of false positive terms is 82
The number of true positive terms is 90
The number of false positive terms is 78
