In [1]:
import os 
import dask.dataframe as dd
import pandas as pd 
pd.get_option("display.max_columns", None)
pd.options.display.max_columns = None
import numpy as np
import glob
import json
from IPython.display import HTML
os.chdir("/mnt/BioHome/jreyna/jreyna/projects/dchallenge/")
outdir = 'results/main/sgls/combined/'
os.makedirs(outdir, exist_ok=True)

In [2]:
major_cols = [
 'gwas_source',
 'ge_source',
 'loop_source',
 'sid',
 'rsid',
 'geneid',
 'gene_name',
 'chrom',
 'snp_pos',
 'tss_start',
 'tss_end',
 'is_eqtl_pair',
 'is_coloc_pair',
 'is_closest_gene',
 'has_fithichip_loop',
 'eqtl_pval',
 'eqtl_fdr',
 'dist',
 'ppH4',
 'gene_start',
 'gene_end',
 'ref',
 'alt',
 'AC',
 'AF',
 'AN',
 'gwas_pval',
 'gene_strand',
 'eqtl_source']

## Loading GENCODE information

In [3]:
gene_info = pd.read_table('results/refs/gencode/v30/gencode.v30.annotation.bed', header=None)
gene_dict = pd.concat([gene_info.iloc[:, [5,6]], gene_info.iloc[:, [6,5]]])
#{sr[5]: sr[6] for i, sr in gene_info.iterrows()}
gene_dict.drop_duplicates(inplace=True)
gene_dict.set_index(5, inplace=True)
gene_dict = gene_dict.squeeze()
gene_dict = gene_dict.to_dict()

## Checking the samplesheet

In [4]:
samplesheet = pd.read_table('config/sgl.samplesheet.tsv.v2')
template = 'results/main/sgls/T1D_34012112_Gaulton/{}/{}'
output_exists = []
for i, sr in samplesheet.iterrows():
    test = os.path.exists(template.format(sr.eqtl_db, sr.eqtl_origin, sr.loop_origin))
    output_exists.append(test)
samplesheet['analyzed'] = output_exists

In [5]:
samplesheet.sort_values(['analyzed', 'eqtl_db', 'eqtl_origin', 'loop_origin'], \
                       ascending=[False, True, True, True], inplace=True)

samplesheet.reset_index(drop=True, inplace=True)
samplesheet.columns = ['GWAS Source', 'eQTL Source', 'GE Source', 'Loop Source', 'Analyzed?']

In [6]:
samplesheet

Unnamed: 0,GWAS Source,eQTL Source,GE Source,Loop Source,Analyzed?
0,T1D_34012112_Gaulton,Quach_2016,monocyte_IAV,monocyte_naive,True
1,T1D_34594039_GCST90018925,Quach_2016,monocyte_IAV,monocyte_naive,True
2,T1D_32005708,Quach_2016,monocyte_IAV,monocyte_naive,True
3,T1D_25751624,Quach_2016,monocyte_IAV,monocyte_naive,True
4,T1D_34012112_Gaulton,Quach_2016,monocyte_Pam3CSK4,monocyte_naive,True
5,T1D_34594039_GCST90018925,Quach_2016,monocyte_Pam3CSK4,monocyte_naive,True
6,T1D_32005708,Quach_2016,monocyte_Pam3CSK4,monocyte_naive,True
7,T1D_25751624,Quach_2016,monocyte_Pam3CSK4,monocyte_naive,True
8,T1D_34012112_Gaulton,Quach_2016,monocyte_R848,monocyte_naive,True
9,T1D_34594039_GCST90018925,Quach_2016,monocyte_R848,monocyte_naive,True


## Combining the other datasets

In [28]:
# loading and concat all the data
data = []
sgls_annos = [x for x in glob.glob('results/main/sgls/*/*/*/*/master.tsv') if 'sgls_20220222' in x]
sgls_annos = [x for x in glob.glob('results/main/sgls/*/*/*/*/master.tsv') if 'sgls_20220222' not in x]

In [29]:
sgls_annos

['results/main/sgls/T1D_32005708/Schmiedel_2018/CD4_T-cell_anti-CD3-CD28/CD4_T-cell_naive/master.tsv',
 'results/main/sgls/T1D_32005708/Schmiedel_2018/CD8_T-cell_naive/CD8_T-cell_naive/master.tsv',
 'results/main/sgls/T1D_32005708/Schmiedel_2018/NK-cell_naive/NK-cell_naive/master.tsv',
 'results/main/sgls/T1D_32005708/Schmiedel_2018/CD8_T-cell_anti-CD3-CD28/CD8_T-cell_naive/master.tsv',
 'results/main/sgls/T1D_32005708/Schmiedel_2018/B-cell_naive/B-cell_naive/master.tsv',
 'results/main/sgls/T1D_32005708/Schmiedel_2018/monocyte_CD16_naive/monocyte_naive/master.tsv',
 'results/main/sgls/T1D_32005708/Schmiedel_2018/monocyte_naive/monocyte_naive/master.tsv',
 'results/main/sgls/T1D_32005708/Quach_2016/monocyte_IAV/monocyte_naive/master.tsv',
 'results/main/sgls/T1D_25751624/Schmiedel_2018/CD4_T-cell_naive/CD4_T-cell_naive/master.tsv',
 'results/main/sgls/T1D_25751624/Schmiedel_2018/monocyte_naive/monocyte_naive/master.tsv',
 'results/main/sgls/T1D_25751624/Schmiedel_2018/CD4_T-cell_anti-C

In [30]:
for fn in sgls_annos:
        
    gwas_source = fn.split('/')[3]
    eqtl_source = fn.split('/')[4]
    ge_source = fn.split('/')[5]
    loop_source = fn.split('/')[6]    
    
    df = pd.read_table(fn, header=0)
    df['gwas_source'] = gwas_source
    df['eqtl_source'] = eqtl_source
    df['ge_source'] = ge_source
    df['loop_source'] = loop_source
    
    # combine the columns gene_id and geneid
    #df.loc[~df['gene_id'].isna(), 'geneid'] = df.loc[~df['gene_id'].isna(), 'gene_id']
    #df.loc[:, 'gene_name'] = df.loc[:, 'geneid'].apply(lambda x: gene_dict[x])
    df = df.loc[(df.is_coloc_pair == 1) & (df.has_fithichip_loop == 1)]
    
    if df.shape[0] > 0: 
        data.append(df)
        
data = pd.concat(data)

In [31]:
data

Unnamed: 0,sid,rsid,geneid,gene_name,chrom,snp_pos,tss_start,tss_end,is_eqtl_pair,is_coloc_pair,is_closest_gene,has_fithichip_loop,has_colocSNP_anchor,eqtl_pval,eqtl_beta,dist,ppH0,ppH1,ppH2,ppH3,ppH4,gene_start,gene_end,ref,alt,AC,AF,AN,gwas_slope,gwas_slope_se,gwas_pval,gene_start.1,gene_end.1,gene_strand,gwas_source,eqtl_source,ge_source,loop_source
42,10:6068912,rs7090530,ENSG00000213994,AL157395.1,chr10,6068912,6202692,6202693,0,1,0,1,0,0.0274683,0.338513,133781.0,0.0,0.0,0.004833528,0.070616,0.924551,6197612,6202693,C,A,3112.0,0.621406,5008.0,0.1855,0.0245,4.026e-14,6197612,6202693,-,T1D_32005708,Schmiedel_2018,CD8_T-cell_naive,CD8_T-cell_naive
27,21:42405613,rs9981624,ENSG00000160183,TMPRSS3,chr21,42405613,42396845,42396846,1,1,0,1,0,6.3658e-11,0.870668,8767.0,0.0,0.0,0.06053392,0.120225,0.819241,42371837,42396846,G,C,1175.0,0.234625,5008.0,0.152034,0.019004,1.244192e-15,42371837,42396846,-,T1D_25751624,Schmiedel_2018,CD4_T-cell_naive,CD4_T-cell_naive
22,2:203875100,rs231724,ENSG00000163599,CTLA4,chr2,203875100,203867770,203867771,0,1,1,1,0,0.0184669,-0.482733,7329.0,0.0,0.0,3.471615e-09,0.103521,0.896479,203867771,203873965,A,G,2094.0,0.418131,5008.0,0.174961,0.018996,3.245297e-20,203867771,203873965,+,T1D_25751624,Schmiedel_2018,monocyte_CD16_naive,monocyte_naive


### Checking the uniq set of genes

In [32]:
unique_genes = data.geneid.unique()
unique_genes = [x for x in unique_genes if type(x) == str]
unique_gnames = [gene_dict[x] if x in gene_dict else x for x in unique_genes]
unique_genes_df = pd.DataFrame([unique_genes, unique_gnames]).T
unique_genes_df.columns = ['geneid', 'genename']
unique_genes_df

Unnamed: 0,geneid,genename
0,ENSG00000213994,AL157395.1
1,ENSG00000160183,TMPRSS3
2,ENSG00000163599,CTLA4


In [35]:
# find the unique SNPs
uniq_snps_by_cells = data.groupby('ge_source').sid.nunique()
uniq_snps_by_cells = uniq_snps_by_cells.to_frame()

# find the unique genes 
uniq_genes_by_cells = data.groupby('ge_source').geneid.nunique()
uniq_genes_by_cells = uniq_genes_by_cells.to_frame()

# merge snps and genes
uniq_counts_by_cells = pd.merge(uniq_snps_by_cells, uniq_genes_by_cells, left_index=True, right_index=True)
uniq_counts_by_cells.columns = ['Number of Unique SNPs', 'Number of Unique Genes']
uniq_counts_by_cells.index.name = 'Cell Line'

In [36]:
# save the file
excel_analysis = os.path.join(outdir, 'Unique_Counts_By_Cell_Line.xlsx')
uniq_counts_by_cells.to_excel(excel_analysis, sheet_name='pieqtls')

In [37]:
uniq_genes_by_cells

Unnamed: 0_level_0,geneid
ge_source,Unnamed: 1_level_1
CD4_T-cell_naive,1
CD8_T-cell_naive,1
monocyte_CD16_naive,1


In [38]:
excel_analysis

'results/main/sgls/combined/Unique_Counts_By_Cell_Line.xlsx'

#### Write the gene list as well

In [39]:
gh_list = sorted(unique_genes_df.geneid.unique().tolist())
gh_fn = os.path.join(outdir, 'gene_list.txt')
with open(gh_fn, 'w') as fw:
    for x in gh_list:
        fw.write('{}\n'.format(x))

In [40]:
gh_fn

'results/main/sgls/combined/gene_list.txt'

### Analyzing the Number of eQTLs, loops, colocaled SNP-Gene (per DataSet)

In [14]:
cells_with_loops = data[data.has_fithichip_loop == 1]
uniq_cells = cells_with_loops[['eqtl_source', 'ge_source', 'loop_source']].values.tolist()
uniq_cells = set([tuple(x) for x in uniq_cells])
uniq_cells = list(uniq_cells)
uniq_cells = pd.DataFrame(uniq_cells)
uniq_cells.columns = ['eqtl_source', 'ge_source', 'loop_source']

MemoryError: Unable to allocate 1.42 GiB for an array with shape (28, 6788669) and data type float64

In [None]:
uniq_cells.sort_values('ge_source')

In [None]:
# group by the dataset combos
eqtl_ge_grps = data.groupby(['eqtl_source', 'ge_source', 'loop_source'])

# calculate the number of sg pairs 
def count_uniq(x):
    v = x[['sid', 'geneid']].values.tolist()
    v = [tuple(x) for x in v]
    v = set(v)
    return(len(v))
eqtl_ge_sg_pairs = eqtl_ge_grps.apply(count_uniq).to_frame()
eqtl_ge_sg_pairs.columns = ['num_sg_pairs']

# calculate the number of sg pairs with loops
eqtl_ge_loops = eqtl_ge_grps.has_fithichip_loop.sum().to_frame()

# calculate the number of sg pairs with significant eQTL signal
eqtl_ge_eqtl = eqtl_ge_grps.is_eqtl_pair.sum().to_frame()

# calculate the number of sg pairs with significant coloc 
eqtl_ge_coloc = eqtl_ge_grps.is_coloc_pair.sum().to_frame()

In [None]:
eqtl_ge_master = eqtl_ge_sg_pairs.merge(eqtl_ge_coloc, left_index=True, right_index=True)
eqtl_ge_master = eqtl_ge_master.merge(eqtl_ge_eqtl, left_index=True, right_index=True)
eqtl_ge_master = eqtl_ge_master.merge(eqtl_ge_loops, left_index=True, right_index=True)

eqtl_ge_master.columns = ['Number of\\nSNP-Gene Pairs',
                          'Number of\\nColoc Pairs',
                          'Number of\\neQTL Pairs',
                          'Number of\\nPairs with a Loop']
eqtl_ge_master.index.names = ['eQTL Source', 'GE Source', 'Loop Source']

In [None]:
HTML(eqtl_ge_master.to_html().replace('\\n', '<br>'))

#### Extract all pairs with a significant eQTL

In [None]:
sg_with_eqtl = data.loc[(data.is_eqtl_pair == 1), major_cols]

In [None]:
print('There are {} SNP-Gene pairs with an eQTL.'.format(len(sg_with_eqtl)))

In [None]:
sg_with_eqtl

#### Extract all pairs with a colocalization

In [None]:
sg_with_coloc = data.loc[(data.is_coloc_pair == 1), major_cols]

In [None]:
print('There are {} SNP-Gene pairs with a coloc.'.format(len(sg_with_coloc)))

In [None]:
sg_with_coloc

#### Extract all pairs with a FitHiChIP Loop

In [None]:
sg_with_loops = data.loc[(data.has_fithichip_loop == 1), major_cols]

In [None]:
print('There are {} SNP-Gene pairs with a loop.'.format(len(sg_with_loops)))

In [None]:
sg_with_loops

## Finalizing the super master table

In [None]:
#data = data.loc[:, new_order]
data.drop_duplicates(subset=['sid', 'geneid', 'eqtl_source', 'ge_source', 'loop_source'], inplace=True)
master_fn = os.path.join(outdir, 'super_master.snp_gene_loop.analysis.tsv')
data[major_cols].to_csv(master_fn, sep='\t', index=False, na_rep='nan')

In [None]:
master_fn = os.path.join(outdir, 'super_master.snp_gene_loop.analysis.xlsx')
xdata = data.sort_values(['rsid', 'gwas_source','eqtl_source', 'ge_source', 'loop_source', 'geneid'])
xdata = xdata[major_cols].set_index(['rsid', 'gwas_source','eqtl_source', 'ge_source', 'loop_source', 'geneid'])
xdata.to_excel(master_fn, na_rep='nan')