In [1]:
import os 
import pandas as pd 
pd.get_option("display.max_columns", None)
pd.options.display.max_columns = None
import numpy as np
import glob
import json
from IPython.display import HTML
os.chdir("/mnt/BioHome/jreyna/jreyna/projects/dchallenge/")
outdir = 'results/main/sgls/combined/'
os.makedirs(outdir, exist_ok=True)

In [2]:
major_cols = [
 'gwas_source',
 'ge_source',
 'loop_source',
 'sid',
 'rsid',
 'geneid',
 'gene_name',
 'chrom',
 'snp_pos',
 'tss_start',
 'tss_end',
 'is_eqtl_pair',
 'is_coloc_pair',
 'is_closest_gene',
 'has_fithichip_loop',
 'eqtl_pval',
 #'eqtl_fdr',
 'dist',
 'ppH4',
 'gene_start',
 'gene_end',
 'ref',
 'alt',
 'AC',
 'AF',
 'AN',
 'gwas_pval',
 'gene_strand',
 'eqtl_source']

## Loading GENCODE information

In [40]:
gene_info = pd.read_table('results/refs/gencode/v30/gencode.v30.annotation.bed', header=None)
gene_dict = pd.concat([gene_info.iloc[:, [5,6]], gene_info.iloc[:, [6,5]]])
gene_dict.drop_duplicates(inplace=True)
gene_dict.set_index(5, inplace=True)
gene_dict = gene_dict.squeeze()
gene_dict = gene_dict.to_dict()

## Checking the samplesheet

In [4]:
template = 'results/main/GRCh37/sgls/ldpairs/eQTL_Catalogue/{gwas_source}/{eqtl_source}/{ge_source}/{loop_source}/script_version/master.tsv'

In [5]:
samplesheet = pd.read_table('config/sgl_samplesheets/sgl.samplesheet.tsv', comment='#')
new_cols = []
for i, sr in samplesheet.iterrows():
    
    d = {'gwas_source': sr.gwas,
         'eqtl_source': sr.eqtl_db,
         'ge_source': sr.eqtl_origin,
         'loop_source': sr.loop_origin}
    master_fn = template.format(**d)
    test = os.path.exists(master_fn)
    new_cols.append([test, master_fn])
new_cols = np.array(new_cols)

samplesheet['analyzed'] = new_cols[:, 0]
samplesheet['filename'] = new_cols[:, 1]
samplesheet.sort_values(['analyzed', 'eqtl_db', 'eqtl_origin', 'loop_origin'], \
                       ascending=[False, True, True, True], inplace=True)
samplesheet.reset_index(drop=True, inplace=True)
samplesheet.columns = ['GWAS Source', 'eQTL Source', 'GE Source', 'Loop Source', 'HiChIP Map', 'Analyzed?', 'File Name']

## Combining the other datasets

In [6]:
# loading and concat all the data
data = []
good_samples = samplesheet.loc[(samplesheet['Analyzed?'] == 'True')]
for i, sr in good_samples.iterrows(): 
    
    df = pd.read_table(sr['File Name'], header=0)
    df['gwas_source'] = sr['GWAS Source']
    df['eqtl_source'] = sr['eQTL Source']
    df['ge_source'] = sr['GE Source']
    df['loop_source'] = sr['Loop Source']
    
    # combine the columns gene_id and geneid
    #df.loc[~df['gene_id'].isna(), 'geneid'] = df.loc[~df['gene_id'].isna(), 'gene_id']
    #df.loc[:, 'gene_name'] = df.loc[:, 'geneid'].apply(lambda x: gene_dict[x])
    df = df.loc[(df.is_coloc_pair == 1) & (df.has_fithichip_loop == 1)]
    
    if df.shape[0] > 0: 
        data.append(df)
    

In [7]:
sgl_data = pd.concat(data)

In [68]:
sgl_data.loc[(sgl_data.gene_name == 'TMPRSS3') & (sgl_data.is_ld_snp == 0)]

Unnamed: 0,sid,rsid,geneid,gene_name,chrom,snp_pos,tss_start,tss_end,is_eqtl_pair,is_coloc_pair,is_closest_gene,has_fithichip_loop,has_colocSNP_anchor,is_ld_snp,eqtl_pval,eqtl_beta,dist,ppH0,ppH1,ppH2,ppH3,ppH4,gene_start,gene_end,ref,alt,AC,AF,AN,gwas_slope,gwas_slope_se,gwas_pval,gene_start.1,gene_end.1,gene_strand,gwas_source,eqtl_source,ge_source,loop_source
2259,21:43825722,rs9981624,ENSG00000160183,TMPRSS3,chr21,44830000,43816954,43816955,0,1,0,1,1,0,,,,0.0,0.0,0.060011,0.127783,0.812207,43791946,43816955,,,1175.0,0.234625,5008.0,0.152034,0.019004,1.244192e-15,43791946,43816955,-,T1D_25751624,GENCORD,T-cell,CD4_T-cell_naive
3538,21:43825722,rs9981624,ENSG00000160183,TMPRSS3,chr21,44830000,43816954,43816955,0,1,0,1,1,0,,,,0.0,0.0,0.061067,0.112535,0.826398,43791946,43816955,,,1175.0,0.234625,5008.0,0.152034,0.019004,1.244192e-15,43791946,43816955,-,T1D_25751624,Schmiedel_2018,Tfh_memory,CD4_T-cell_naive


In [66]:
sgl_data.loc[(sgl_data.is_eqtl_pair == 1) & (sgl_data.is_ld_snp == 0)]

Unnamed: 0,sid,rsid,geneid,gene_name,chrom,snp_pos,tss_start,tss_end,is_eqtl_pair,is_coloc_pair,is_closest_gene,has_fithichip_loop,has_colocSNP_anchor,is_ld_snp,eqtl_pval,eqtl_beta,dist,ppH0,ppH1,ppH2,ppH3,ppH4,gene_start,gene_end,ref,alt,AC,AF,AN,gwas_slope,gwas_slope_se,gwas_pval,gene_start.1,gene_end.1,gene_strand,gwas_source,eqtl_source,ge_source,loop_source


### Checking the uniq set of genes

In [58]:
unique_genes = sgl_data.loc[(sgl_data.is_ld_snp == 0)].geneid.value_counts().to_frame()
unique_gnames = [gene_dict[x] if x in gene_dict else x for x in unique_genes.index.tolist()]
unique_genes.loc[:, 'genename'] = unique_gnames
unique_genes.sort_values('genename', inplace=True)
unique_genes

Unnamed: 0,geneid,genename
ENSG00000230918,3,AC008063.1
ENSG00000271889,1,AC016747.2
ENSG00000259474,1,AC027228.1
ENSG00000224745,1,AC063965.1
ENSG00000107796,1,ACTA2
ENSG00000228124,3,AL096678.1
ENSG00000226455,2,AL121787.1
ENSG00000260271,2,AL132996.1
ENSG00000273797,1,AL133445.2
ENSG00000229664,2,AL137186.1


In [59]:
unique_genes.shape

(28, 2)

In [60]:
unique_genes = sgl_data.loc[(sgl_data.is_ld_snp == 1)].geneid.value_counts().to_frame()
unique_gnames = [gene_dict[x] if x in gene_dict else x for x in unique_genes.index.tolist()]
unique_genes.loc[:, 'genename'] = unique_gnames
unique_genes.sort_values('genename', inplace=True)
unique_genes

Unnamed: 0,geneid,genename
ENSG00000230918,9,AC008063.1
ENSG00000271889,77,AC016747.2
ENSG00000259474,28,AC027228.1
ENSG00000224745,20,AC063965.1
ENSG00000107796,20,ACTA2
ENSG00000228124,107,AL096678.1
ENSG00000226455,64,AL121787.1
ENSG00000260271,89,AL132996.1
ENSG00000273797,32,AL133445.2
ENSG00000229664,7,AL137186.1


In [61]:
unique_genes.shape

(28, 2)

In [10]:
# find the unique SNPs
uniq_snps_by_cells = sgl_data.groupby('ge_source').sid.nunique()
uniq_snps_by_cells = uniq_snps_by_cells.to_frame()

# find the unique genes 
uniq_genes_by_cells = sgl_data.groupby('ge_source').geneid.nunique()
uniq_genes_by_cells = uniq_genes_by_cells.to_frame()

# merge snps and genes
uniq_counts_by_cells = pd.merge(uniq_snps_by_cells, uniq_genes_by_cells, left_index=True, right_index=True)
uniq_counts_by_cells.columns = ['Number of Unique SNPs', 'Number of Unique Genes']
uniq_counts_by_cells.index.name = 'Cell Line'

In [11]:
# save the file
excel_analysis = os.path.join(outdir, 'Unique_Counts_By_Cell_Line.xlsx')
uniq_counts_by_cells.to_excel(excel_analysis, sheet_name='pieqtls')

In [12]:
uniq_genes_by_cells

Unnamed: 0_level_0,geneid
ge_source,Unnamed: 1_level_1
NK-cell_naive,1
T-cell,5
Tfh_memory,7
Th1-17_memory,2
Th17_memory,5
Th1_memory,3
Th2_memory,2
Treg_memory,1
Treg_naive,5
monocyte,3


In [13]:
excel_analysis

'results/main/sgls/combined/Unique_Counts_By_Cell_Line.xlsx'

#### Write the gene list as well

In [14]:
gh_list = sorted(unique_genes_df.geneid.unique().tolist())
gh_fn = os.path.join(outdir, 'gene_list.txt')
with open(gh_fn, 'w') as fw:
    for x in gh_list:
        fw.write('{}\n'.format(x))

In [15]:
gh_fn

'results/main/sgls/combined/gene_list.txt'

### Analyzing the Number of eQTLs, loops, colocaled SNP-Gene (per DataSet)

In [16]:
cells_with_loops = sgl_data[sgl_data.has_fithichip_loop == 1]
uniq_cells = cells_with_loops[['eqtl_source', 'ge_source', 'loop_source']].values.tolist()
uniq_cells = set([tuple(x) for x in uniq_cells])
uniq_cells = list(uniq_cells)
uniq_cells = pd.DataFrame(uniq_cells)
uniq_cells.columns = ['eqtl_source', 'ge_source', 'loop_source']

In [17]:
uniq_cells.sort_values('ge_source')

Unnamed: 0,eqtl_source,ge_source,loop_source
6,Schmiedel_2018,NK-cell_naive,NK-cell_naive
0,GENCORD,T-cell,CD4_T-cell_naive
5,BLUEPRINT,T-cell,CD4_T-cell_naive
11,Schmiedel_2018,Tfh_memory,CD4_T-cell_naive
2,Schmiedel_2018,Th1-17_memory,CD4_T-cell_naive
10,Schmiedel_2018,Th17_memory,CD4_T-cell_naive
3,Schmiedel_2018,Th1_memory,CD4_T-cell_naive
14,Schmiedel_2018,Th2_memory,CD4_T-cell_naive
12,Schmiedel_2018,Treg_memory,CD4_T-cell_naive
8,Schmiedel_2018,Treg_naive,CD4_T-cell_naive


In [18]:
# group by the dataset combos
eqtl_ge_grps = sgl_data.groupby(['eqtl_source', 'ge_source', 'loop_source'])

# calculate the number of sg pairs 
def count_uniq(x):
    v = x[['sid', 'geneid']].values.tolist()
    v = [tuple(x) for x in v]
    v = set(v)
    return(len(v))
eqtl_ge_sg_pairs = eqtl_ge_grps.apply(count_uniq).to_frame()
eqtl_ge_sg_pairs.columns = ['num_sg_pairs']

# calculate the number of sg pairs with loops
eqtl_ge_loops = eqtl_ge_grps.has_fithichip_loop.sum().to_frame()

# calculate the number of sg pairs with significant eQTL signal
eqtl_ge_eqtl = eqtl_ge_grps.is_eqtl_pair.sum().to_frame()

# calculate the number of sg pairs with significant coloc 
eqtl_ge_coloc = eqtl_ge_grps.is_coloc_pair.sum().to_frame()

In [19]:
eqtl_ge_master = eqtl_ge_sg_pairs.merge(eqtl_ge_coloc, left_index=True, right_index=True)
eqtl_ge_master = eqtl_ge_master.merge(eqtl_ge_eqtl, left_index=True, right_index=True)
eqtl_ge_master = eqtl_ge_master.merge(eqtl_ge_loops, left_index=True, right_index=True)

eqtl_ge_master.columns = ['Number of\\nSNP-Gene Pairs',
                          'Number of\\nColoc Pairs',
                          'Number of\\neQTL Pairs',
                          'Number of\\nPairs with a Loop']
eqtl_ge_master.index.names = ['eQTL Source', 'GE Source', 'Loop Source']

In [20]:
HTML(eqtl_ge_master.to_html().replace('\\n', '<br>'))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number of SNP-Gene Pairs,Number of Coloc Pairs,Number of eQTL Pairs,Number of Pairs with a Loop
eQTL Source,GE Source,Loop Source,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BLUEPRINT,T-cell,CD4_T-cell_naive,131,142,0,142
BLUEPRINT,monocyte,monocyte_naive,102,102,0,102
GENCORD,T-cell,CD4_T-cell_naive,110,110,0,110
Quach_2016,monocyte_IAV,monocyte_naive,52,52,0,52
Quach_2016,monocyte_Pam3CSK4,monocyte_naive,35,35,0,35
Quach_2016,monocyte_R848,monocyte_naive,245,245,0,245
Schmiedel_2018,NK-cell_naive,NK-cell_naive,44,44,0,44
Schmiedel_2018,Tfh_memory,CD4_T-cell_naive,369,399,0,399
Schmiedel_2018,Th1-17_memory,CD4_T-cell_naive,70,70,0,70
Schmiedel_2018,Th17_memory,CD4_T-cell_naive,62,62,0,62


#### Extract all pairs with a significant eQTL

In [21]:
sg_with_eqtl = sgl_data.loc[(sgl_data.is_eqtl_pair == 1), major_cols]

In [22]:
print('There are {} SNP-Gene pairs with an eQTL.'.format(len(sg_with_eqtl)))

There are 0 SNP-Gene pairs with an eQTL.


In [23]:
sg_with_eqtl

Unnamed: 0,gwas_source,ge_source,loop_source,sid,rsid,geneid,gene_name,chrom,snp_pos,tss_start,tss_end,is_eqtl_pair,is_coloc_pair,is_closest_gene,has_fithichip_loop,eqtl_pval,dist,ppH4,gene_start,gene_end,ref,alt,AC,AF,AN,gwas_pval,gene_strand,eqtl_source


#### Extract all pairs with a colocalization

In [24]:
sgl_data

Unnamed: 0,sid,rsid,geneid,gene_name,chrom,snp_pos,tss_start,tss_end,is_eqtl_pair,is_coloc_pair,is_closest_gene,has_fithichip_loop,has_colocSNP_anchor,is_ld_snp,eqtl_pval,eqtl_beta,dist,ppH0,ppH1,ppH2,ppH3,ppH4,gene_start,gene_end,ref,alt,AC,AF,AN,gwas_slope,gwas_slope_se,gwas_pval,gene_start.1,gene_end.1,gene_strand,gwas_source,eqtl_source,ge_source,loop_source
52,6:90936894,rs206913,ENSG00000260271,AL132996.1,chr6,91940000,91030161,91030162,0,1,0,1,1,1,,,,1.097200e-145,2.458868e-144,7.771106e-03,0.173335,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,-0.129995,0.018999,7.803700e-12,91030162,91072351,+,T1D_25751624,BLUEPRINT,T-cell,CD4_T-cell_naive
53,6:90939178,rs58516003,ENSG00000260271,AL132996.1,chr6,91940000,91030161,91030162,0,1,0,1,1,1,,,,1.097200e-145,2.458868e-144,7.771106e-03,0.173335,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,-0.129995,0.018999,7.803700e-12,91030162,91072351,+,T1D_25751624,BLUEPRINT,T-cell,CD4_T-cell_naive
83,6:90950736,rs2256672,ENSG00000260271,AL132996.1,chr6,91955000,91030161,91030162,0,1,0,1,1,1,,,,1.097200e-145,2.458868e-144,7.771106e-03,0.173335,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,-0.129995,0.018999,7.803700e-12,91030162,91072351,+,T1D_25751624,BLUEPRINT,T-cell,CD4_T-cell_naive
111,6:90960329,rs673780,ENSG00000260271,AL132996.1,chr6,91965000,91030161,91030162,0,1,0,1,1,1,,,,1.097200e-145,2.458868e-144,7.771106e-03,0.173335,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,-0.129995,0.018999,7.803700e-12,91030162,91072351,+,T1D_25751624,BLUEPRINT,T-cell,CD4_T-cell_naive
139,6:90968949,rs619192,ENSG00000260271,AL132996.1,chr6,91970000,91030161,91030162,0,1,0,1,1,1,,,,1.097200e-145,2.458868e-144,7.771106e-03,0.173335,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,-0.129995,0.018999,7.803700e-12,91030162,91072351,+,T1D_25751624,BLUEPRINT,T-cell,CD4_T-cell_naive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,11:2182224,rs689,ENSG00000229414,KCNQ1-AS1,chr11,3185000,2882797,2882798,0,1,0,1,1,0,,,,0.000000e+00,0.000000e+00,2.138070e-23,0.060991,0.939009,2861365,2882798,,,3253.0,0.649561,5008.0,0.308500,0.022200,9.360000e-44,2861365,2882798,-,T1D_34594039_GCST90018925,Schmiedel_2018,monocyte_CD16_naive,monocyte_naive
1118,11:2187855,rs12419447,ENSG00000229414,KCNQ1-AS1,chr11,3190000,2882797,2882798,0,1,0,1,1,1,,,,0.000000e+00,0.000000e+00,2.138070e-23,0.060991,0.939009,2861365,2882798,,,3253.0,0.649561,5008.0,0.308500,0.022200,9.360000e-44,2861365,2882798,-,T1D_34594039_GCST90018925,Schmiedel_2018,monocyte_CD16_naive,monocyte_naive
1119,11:2189983,rs4930043,ENSG00000229414,KCNQ1-AS1,chr11,3190000,2882797,2882798,0,1,0,1,1,1,,,,0.000000e+00,0.000000e+00,2.138070e-23,0.060991,0.939009,2861365,2882798,,,3253.0,0.649561,5008.0,0.308500,0.022200,9.360000e-44,2861365,2882798,-,T1D_34594039_GCST90018925,Schmiedel_2018,monocyte_CD16_naive,monocyte_naive
1120,11:2188238,rs6357,ENSG00000229414,KCNQ1-AS1,chr11,3190000,2882797,2882798,0,1,0,1,1,1,,,,0.000000e+00,0.000000e+00,2.138070e-23,0.060991,0.939009,2861365,2882798,,,3253.0,0.649561,5008.0,0.308500,0.022200,9.360000e-44,2861365,2882798,-,T1D_34594039_GCST90018925,Schmiedel_2018,monocyte_CD16_naive,monocyte_naive


In [25]:
sg_with_coloc = sgl_data.loc[(sgl_data.is_coloc_pair == 1), major_cols]

In [26]:
print('There are {} SNP-Gene pairs with a coloc.'.format(len(sg_with_coloc)))

There are 1525 SNP-Gene pairs with a coloc.


In [27]:
sg_with_coloc

Unnamed: 0,gwas_source,ge_source,loop_source,sid,rsid,geneid,gene_name,chrom,snp_pos,tss_start,tss_end,is_eqtl_pair,is_coloc_pair,is_closest_gene,has_fithichip_loop,eqtl_pval,dist,ppH4,gene_start,gene_end,ref,alt,AC,AF,AN,gwas_pval,gene_strand,eqtl_source
52,T1D_25751624,T-cell,CD4_T-cell_naive,6:90936894,rs206913,ENSG00000260271,AL132996.1,chr6,91940000,91030161,91030162,0,1,0,1,,,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,7.803700e-12,+,BLUEPRINT
53,T1D_25751624,T-cell,CD4_T-cell_naive,6:90939178,rs58516003,ENSG00000260271,AL132996.1,chr6,91940000,91030161,91030162,0,1,0,1,,,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,7.803700e-12,+,BLUEPRINT
83,T1D_25751624,T-cell,CD4_T-cell_naive,6:90950736,rs2256672,ENSG00000260271,AL132996.1,chr6,91955000,91030161,91030162,0,1,0,1,,,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,7.803700e-12,+,BLUEPRINT
111,T1D_25751624,T-cell,CD4_T-cell_naive,6:90960329,rs673780,ENSG00000260271,AL132996.1,chr6,91965000,91030161,91030162,0,1,0,1,,,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,7.803700e-12,+,BLUEPRINT
139,T1D_25751624,T-cell,CD4_T-cell_naive,6:90968949,rs619192,ENSG00000260271,AL132996.1,chr6,91970000,91030161,91030162,0,1,0,1,,,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,7.803700e-12,+,BLUEPRINT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,T1D_34594039_GCST90018925,monocyte_CD16_naive,monocyte_naive,11:2182224,rs689,ENSG00000229414,KCNQ1-AS1,chr11,3185000,2882797,2882798,0,1,0,1,,,0.939009,2861365,2882798,,,3253.0,0.649561,5008.0,9.360000e-44,-,Schmiedel_2018
1118,T1D_34594039_GCST90018925,monocyte_CD16_naive,monocyte_naive,11:2187855,rs12419447,ENSG00000229414,KCNQ1-AS1,chr11,3190000,2882797,2882798,0,1,0,1,,,0.939009,2861365,2882798,,,3253.0,0.649561,5008.0,9.360000e-44,-,Schmiedel_2018
1119,T1D_34594039_GCST90018925,monocyte_CD16_naive,monocyte_naive,11:2189983,rs4930043,ENSG00000229414,KCNQ1-AS1,chr11,3190000,2882797,2882798,0,1,0,1,,,0.939009,2861365,2882798,,,3253.0,0.649561,5008.0,9.360000e-44,-,Schmiedel_2018
1120,T1D_34594039_GCST90018925,monocyte_CD16_naive,monocyte_naive,11:2188238,rs6357,ENSG00000229414,KCNQ1-AS1,chr11,3190000,2882797,2882798,0,1,0,1,,,0.939009,2861365,2882798,,,3253.0,0.649561,5008.0,9.360000e-44,-,Schmiedel_2018


#### Extract all pairs with a FitHiChIP Loop

In [28]:
sg_with_loops = sgl_data.loc[(sgl_data.has_fithichip_loop == 1), major_cols]

In [29]:
print('There are {} SNP-Gene pairs with a loop.'.format(len(sg_with_loops)))

There are 1525 SNP-Gene pairs with a loop.


In [30]:
sg_with_loops

Unnamed: 0,gwas_source,ge_source,loop_source,sid,rsid,geneid,gene_name,chrom,snp_pos,tss_start,tss_end,is_eqtl_pair,is_coloc_pair,is_closest_gene,has_fithichip_loop,eqtl_pval,dist,ppH4,gene_start,gene_end,ref,alt,AC,AF,AN,gwas_pval,gene_strand,eqtl_source
52,T1D_25751624,T-cell,CD4_T-cell_naive,6:90936894,rs206913,ENSG00000260271,AL132996.1,chr6,91940000,91030161,91030162,0,1,0,1,,,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,7.803700e-12,+,BLUEPRINT
53,T1D_25751624,T-cell,CD4_T-cell_naive,6:90939178,rs58516003,ENSG00000260271,AL132996.1,chr6,91940000,91030161,91030162,0,1,0,1,,,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,7.803700e-12,+,BLUEPRINT
83,T1D_25751624,T-cell,CD4_T-cell_naive,6:90950736,rs2256672,ENSG00000260271,AL132996.1,chr6,91955000,91030161,91030162,0,1,0,1,,,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,7.803700e-12,+,BLUEPRINT
111,T1D_25751624,T-cell,CD4_T-cell_naive,6:90960329,rs673780,ENSG00000260271,AL132996.1,chr6,91965000,91030161,91030162,0,1,0,1,,,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,7.803700e-12,+,BLUEPRINT
139,T1D_25751624,T-cell,CD4_T-cell_naive,6:90968949,rs619192,ENSG00000260271,AL132996.1,chr6,91970000,91030161,91030162,0,1,0,1,,,0.818894,91030162,91072351,,,3488.0,0.696486,5008.0,7.803700e-12,+,BLUEPRINT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,T1D_34594039_GCST90018925,monocyte_CD16_naive,monocyte_naive,11:2182224,rs689,ENSG00000229414,KCNQ1-AS1,chr11,3185000,2882797,2882798,0,1,0,1,,,0.939009,2861365,2882798,,,3253.0,0.649561,5008.0,9.360000e-44,-,Schmiedel_2018
1118,T1D_34594039_GCST90018925,monocyte_CD16_naive,monocyte_naive,11:2187855,rs12419447,ENSG00000229414,KCNQ1-AS1,chr11,3190000,2882797,2882798,0,1,0,1,,,0.939009,2861365,2882798,,,3253.0,0.649561,5008.0,9.360000e-44,-,Schmiedel_2018
1119,T1D_34594039_GCST90018925,monocyte_CD16_naive,monocyte_naive,11:2189983,rs4930043,ENSG00000229414,KCNQ1-AS1,chr11,3190000,2882797,2882798,0,1,0,1,,,0.939009,2861365,2882798,,,3253.0,0.649561,5008.0,9.360000e-44,-,Schmiedel_2018
1120,T1D_34594039_GCST90018925,monocyte_CD16_naive,monocyte_naive,11:2188238,rs6357,ENSG00000229414,KCNQ1-AS1,chr11,3190000,2882797,2882798,0,1,0,1,,,0.939009,2861365,2882798,,,3253.0,0.649561,5008.0,9.360000e-44,-,Schmiedel_2018


## Finalizing the super master table

In [31]:
#data = data.loc[:, new_order]
sgl_data.drop_duplicates(subset=['sid', 'geneid', 'eqtl_source', 'ge_source', 'loop_source'], inplace=True)
master_fn = os.path.join(outdir, 'super_master.snp_gene_loop.analysis.tsv')
sgl_data[major_cols].to_csv(master_fn, sep='\t', index=False, na_rep='nan')

In [32]:
master_fn = os.path.join(outdir, 'super_master.snp_gene_loop.analysis.xlsx')
xdata = sgl_data.sort_values(['rsid', 'gwas_source','eqtl_source', 'ge_source', 'loop_source', 'geneid'])
xdata = xdata[major_cols].set_index(['rsid', 'gwas_source','eqtl_source', 'ge_source', 'loop_source', 'geneid'])
xdata.to_excel(master_fn, na_rep='nan')