In [86]:
import os 
import pandas as pd 
import glob
import json
# Work from the project root so every relative path below resolves.
os.chdir("/mnt/BioHome/jreyna/jreyna/projects/dchallenge/")

# Directory that receives the summary files written by this notebook.
outdir = 'results/main/basic_stats/'
# Fixed typo: `os.makedirsrs` does not exist and raised AttributeError on a fresh kernel.
os.makedirs(outdir, exist_ok=True)

## Summarize GWAS

In [87]:
# Tab-separated GWAS table used as colocalization input.
gwas_fn = 'results/main/2021_Nikhil_eQTL/Data/T1D_GWAS/T1D_34012112_Gaulton/GWAS_input_colocalization_pval_lt_5eMinus8.txt'
gwas = pd.read_csv(gwas_fn, sep='\t')

In [88]:
# Preview the first rows of the GWAS table.
gwas.head(n=5)

Unnamed: 0,CHR,POS,BETA,SE,P,N
0,chr1,25293941,0.076575,0.01396,4.13e-08,520580
1,chr1,25294607,0.077268,0.01404,3.73e-08,520580
2,chr1,25294878,0.076429,0.013922,4.03e-08,520580
3,chr1,25295580,0.076999,0.013951,3.4e-08,520580
4,chr1,25296478,0.077267,0.014047,3.78e-08,520580


In [89]:
# One row per SNP, so the row count is the SNP count.
print('The GWAS has {} snps.'.format(len(gwas)))

The GWAS has 45994 snps.


In [90]:
# Persist the GWAS row count as a bare number in a one-line text file.
fn = os.path.join(outdir, 'chiou_et_al.num_gwas_snps.txt')
with open(fn, mode='w') as f:
    f.write('{}'.format(len(gwas)))

## Summarize eQTLs

In [61]:
# Collect the gzipped eQTL summary files of both studies into one list.
eqtl_fn1 = glob.glob('results/main/2021_Nikhil_eQTL/Data/eqtl_sqtl_summ_stats/BLUEPRINT_eQTL/*.txt.gz')
eqtl_fn2 = glob.glob('results/main/2021_Nikhil_eQTL/Data/eqtl_sqtl_summ_stats/DICE_eQTL//*.txt.gz')
eqtl_fns = [*eqtl_fn1, *eqtl_fn2]

In [62]:
# Read every eQTL file and tag its rows with the study and cell line it came from.
eqtl_data = []
for fn in eqtl_fns:
    df = pd.read_table(fn)

    # Take the labels from the tail of the path (parent directory and file
    # name) rather than from fixed '/'-split positions: fixed indices pick
    # the wrong component as soon as the base directory depth changes
    # (e.g. absolute paths or a relocated results folder).
    study = os.path.basename(os.path.dirname(fn)).replace('_eQTL', '')
    # The cell line is the file name up to its first '.'.
    cline = os.path.basename(fn).split('.')[0]

    df['study'] = study
    df['cline'] = cline

    eqtl_data.append(df)


In [63]:
# Stack the per-file frames row-wise; from here on `eqtl_data` is a single DataFrame.
eqtl_data = pd.concat(eqtl_data, axis=0)

In [139]:
# Preview the combined eQTL table, including the added study / cline labels.
eqtl_data.head(n=5)

Unnamed: 0,pid,nvar,shape1,shape2,dummy,sid,dist,npval,slope,ppval,bpval,qval,study,cline
0,RP11-809C18.3,3958,1.04397,193.339,143.536,10:1116997,442418,6.05014e-06,0.437403,0.003996,0.00711696,0.03315073,BLUEPRINT,Neutrophil
1,WDR37,5379,1.09896,341.843,148.721,10:1108544,13065,1.09987e-07,0.482763,0.000999,0.000171562,0.001220888,BLUEPRINT,Neutrophil
2,ADARB2,5693,1.02652,381.476,150.294,10:1282530,54456,1.5554400000000002e-33,1.06764,0.000999,8.20227e-27,5.319393e-25,BLUEPRINT,Neutrophil
3,RP11-482E14.1,6377,1.03604,509.748,155.394,10:3507739,-21348,3.4101e-07,0.700451,0.000999,0.000693508,0.004405235,BLUEPRINT,Neutrophil
4,RP11-433J20.2,6735,1.123,432.924,145.437,10:4392931,325798,3.39914e-06,-0.208851,0.004995,0.00598754,0.02878668,BLUEPRINT,Neutrophil


In [141]:
# Grouping reused by all the per-study / per-cell-line summaries below.
eqtl_summary = eqtl_data.groupby(by=['study', 'cline'])

#### Total number of eQTLs (SNP-gene pairs)

In [166]:
# Rows per (study, cline) group, largest first; the count column keeps the name 'sid'.
total_eqtls = (
    eqtl_summary['sid']
    .size()
    .reset_index(name='sid')
    .sort_values(by=['sid', 'study', 'cline'], ascending=[False, True, True])
)
total_eqtls.head()

Unnamed: 0,study,cline,sid
0,BLUEPRINT,Monocyte,7716
2,BLUEPRINT,T-cell,7211
1,BLUEPRINT,Neutrophil,6399
14,DICE,TH17,2520
4,DICE,CD4_NAIVE,2504


In [167]:
# Distribution of the per-cell-line counts within each study.
total_eqtls.groupby(by='study').describe()

Unnamed: 0_level_0,sid,sid,sid,sid,sid,sid,sid,sid
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
study,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
BLUEPRINT,3.0,7108.666667,664.436854,6399.0,6805.0,7211.0,7463.5,7716.0
DICE,15.0,2114.866667,220.824386,1826.0,1936.5,2104.0,2219.5,2520.0


#### Total number of unique SNPs per study + cell line

In [168]:
# Distinct SNP ids ('sid') per (study, cline) group, largest first.
uniq_snps = (
    eqtl_summary['sid']
    .nunique()
    .reset_index()
    .sort_values(by=['sid', 'study', 'cline'], ascending=[False, True, True])
)

In [169]:
# Distribution of the unique-SNP counts within each study.
uniq_snps.groupby(by='study').describe()

Unnamed: 0_level_0,sid,sid,sid,sid,sid,sid,sid,sid
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
study,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
BLUEPRINT,3.0,6547.666667,755.469611,5696.0,6253.0,6810.0,6973.5,7137.0
DICE,15.0,2040.266667,225.787026,1749.0,1867.0,2010.0,2147.5,2453.0


#### Total number of unique eGenes per study + cell line

In [170]:
# Distinct gene ids ('pid') per (study, cline) group, largest first.
uniq_egenes = (
    eqtl_summary['pid']
    .nunique()
    .reset_index()
    .sort_values(by=['pid', 'study', 'cline'], ascending=[False, True, True])
)

In [171]:
# Distribution of the unique-eGene counts within each study.
uniq_egenes.groupby(by='study').describe()

Unnamed: 0_level_0,pid,pid,pid,pid,pid,pid,pid,pid
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
study,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
BLUEPRINT,3.0,7108.666667,664.436854,6399.0,6805.0,7211.0,7463.5,7716.0
DICE,15.0,2114.866667,220.824386,1826.0,1936.5,2104.0,2219.5,2520.0


#### Making a master eQTL summary table

In [172]:
# Join the three per-group summaries on their shared keys. Both count tables
# carry a 'sid' column, so pandas suffixes them as 'sid_x' / 'sid_y'.
master_eqtl_table = (
    total_eqtls
    .merge(uniq_snps, on=['study', 'cline'])
    .merge(uniq_egenes, on=['study', 'cline'])
)

In [173]:
# Preview the merged table before its columns are relabelled.
master_eqtl_table.head(n=5)

Unnamed: 0,study,cline,sid_x,sid_y,pid
0,BLUEPRINT,Monocyte,7716,7137,7716
1,BLUEPRINT,T-cell,7211,6810,7211
2,BLUEPRINT,Neutrophil,6399,5696,6399
3,DICE,TH17,2520,2453,2520
4,DICE,CD4_NAIVE,2504,2434,2504


In [174]:
# Positional relabelling: the order must match study, cline, sid_x, sid_y, pid.
master_eqtl_table.columns = [
    'Study',
    'Cell Line',
    'Number of eQTLs',
    'Number of Unique SNPs',
    'Number of Unique eGenes',
]

In [175]:
# Preview the table with its presentation column names.
master_eqtl_table.head(n=5)

Unnamed: 0,Study,Cell Line,Number of eQTLs,Number of Unique SNPs,Number of Unique eGenes
0,BLUEPRINT,Monocyte,7716,7137,7716
1,BLUEPRINT,T-cell,7211,6810,7211
2,BLUEPRINT,Neutrophil,6399,5696,6399
3,DICE,TH17,2520,2453,2520
4,DICE,CD4_NAIVE,2504,2434,2504


In [189]:
# Export the eQTL study description as a spreadsheet, without the index column.
fn = os.path.join(outdir, 'description_of_eqtl_study.xlsx')
master_eqtl_table.to_excel(excel_writer=fn, index=False)

## Summarize colocalized SNPs

In [177]:
# Load every per-comparison master.tsv and concatenate them into one frame.
data = []
for fn in glob.glob('results/main/loop_analysis/Coloc_Approach/T1D_34012112_Gaulton/*/*/*/master.tsv'):

    # The three directories directly above master.tsv (the three wildcards in
    # the pattern) supply the labels. They are taken relative to the file name
    # instead of at fixed '/'-split positions, which pick the wrong component
    # as soon as the base directory depth changes.
    dice_cline, study, eqtl_cline = fn.split('/')[-4:-1]

    df = pd.read_table(fn, header=0)
    df['dice_cline'] = dice_cline
    df['eqtl_cline'] = eqtl_cline
    df['study'] = study
    data.append(df)

# '5kb_gname' starts with a digit, so it cannot be used as an attribute name;
# give it an identifier-safe name.
data = pd.concat(data).rename(columns={'5kb_gname': 'fivekb_gname'})

In [178]:
# Column order for the combined table, built from small thematic groups.
new_order = (
    # SNP / gene identifiers
    ['sid', 'rs_id', 'gene_name']
    # labels attached while loading the per-comparison files
    + ['dice_cline', 'eqtl_cline', 'study']
    # gene id and coordinates
    + ['gene_id', 'chrom', 'snp_pos', 'gene_start', 'gene_end']
    # pair-level flags
    + ['is_eqtl_pair', 'is_coloc_pair', 'is_closest_gene', 'has_fithichip_loop']
    # columns that also appear in the eQTL summary tables
    + ['nvar', 'shape1', 'shape2', 'dist', 'npval', 'slope', 'ppval', 'bpval', 'qval']
    # pp_H0 .. pp_H4 colocalization summary columns
    + ['pp_H{}_Coloc_Summary'.format(i) for i in range(5)]
    # allele columns
    + ['ref', 'alt', 'AC', 'AF', 'AN']
    # GWAS columns
    + ['gwas_slope', 'gwas_slope_se', 'gwas_pval_nominal', 'SampleSize']
)

In [179]:
# Keep only the listed columns (in that order), then keep the first row for
# each SNP / gene / cell-line / study combination.
data = (
    data
    .loc[:, new_order]
    .drop_duplicates(subset=['sid', 'gene_name', 'dice_cline', 'eqtl_cline', 'study'])
)

In [180]:
# master_fn = os.path.join(outdir, 'super_master.snp_gene_loop.analysis.tsv')
# data.to_csv(master_fn, sep='\t', index=False, na_rep='nan')

## Colocalization Summary

I want to know how many colocalization comparisons there are total

In [190]:
# Rows flagged as colocalized pairs.
coloc_only = data.loc[data['is_coloc_pair'] == 1]
print(len(coloc_only))

251


I want to know how many colocalizations have a loop

In [191]:
# Rows flagged both as colocalized pairs and as having a FitHiChIP loop.
coloc_loops_only = data.loc[
    (data['is_coloc_pair'] == 1) & (data['has_fithichip_loop'] == 1)
]
print(len(coloc_loops_only))

50


## Breakdown of colocalized loops (SNP perspective)

I want to know the breakdown in terms of gene

In [192]:
# Number of rows per (SNP, gene) pair, most frequent first.
coloc_loops_only[['sid', 'gene_name']].value_counts()

sid          gene_name  
21:43855067  UBASH3A        14
6:90976768   BACH2           7
16:11433103  RMI2            3
11:64107735  AP003774.1      3
12:9833628   RP11-75L1.1     3
11:64107477  AP003774.1      2
20:1610551   SIRPG           2
12:56401085  RPS26           2
16:11439303  RMI2            2
12:9147569   M6PR            1
15:79229199  CTSH            1
15:79231478  CTSH            1
16:11439679  RMI2            1
16:28599411  SULT1A2         1
16:28631530  SULT1A2         1
1:114426001  AP4B1           1
1:114447565  PTPN22          1
1:192537400  RGS1            1
21:43823736  TMPRSS3         1
21:43827765  TMPRSS3         1
11:64102948  AP003774.1      1
dtype: int64

I want to know the breakdown in terms of cell line

In [193]:
# Number of rows per (gene, SNP, dice_cline) combination, most frequent first.
coloc_loops_only[['gene_name', 'sid', 'dice_cline']].value_counts()

gene_name    sid          dice_cline
UBASH3A      21:43855067  TREGMEM       2
                          TH2           2
                          TH17          2
                          TH1           2
                          TFH           2
                          THSTAR        2
BACH2        6:90976768   TH17          1
M6PR         12:9147569   TREGNAIVE     1
CTSH         15:79231478  CM            1
             15:79229199  NCM           1
BACH2        6:90976768   TREGNAIVE     1
                          TREGMEM       1
                          THSTAR        1
                          TH2           1
                          TFH           1
                          TH1           1
RGS1         1:192537400  NCM           1
AP4B1        1:114426001  NCM           1
AP003774.1   11:64107735  TREGMEM       1
                          TH2           1
                          NB            1
             11:64107477  THSTAR        1
                          NCM          

## Breakdown of colocalized loops (Gene perspective)

In [194]:
# Distinct rs ids observed for each gene.
coloc_loops_only.groupby('gene_name')['rs_id'].unique()

gene_name
AP003774.1        [rs663743, rs479777, rs574087]
AP4B1                               [rs11102694]
BACH2                               [rs72928038]
CTSH                    [rs12592898, rs12148472]
M6PR                                 [rs1805721]
PTPN22                               [rs1217397]
RGS1                                 [rs1323297]
RMI2           [rs7187741, rs12149160, rs918738]
RP11-75L1.1                          [rs3764021]
RPS26                               [rs10876864]
SIRPG                                [rs2281808]
SULT1A2                 [rs62031607, rs55792032]
TMPRSS3                   [rs9978717, rs9784215]
UBASH3A                              [rs1893592]
Name: rs_id, dtype: object

In [195]:
# Number of distinct rs ids per gene, ascending.
coloc_loops_only.groupby('gene_name')['rs_id'].nunique().sort_values()

gene_name
AP4B1          1
BACH2          1
M6PR           1
PTPN22         1
RGS1           1
RP11-75L1.1    1
RPS26          1
SIRPG          1
UBASH3A        1
CTSH           2
SULT1A2        2
TMPRSS3        2
AP003774.1     3
RMI2           3
Name: rs_id, dtype: int64

In [196]:
# One entry per gene, so the shape gives the number of genes (sorting does not change it).
coloc_loops_only.groupby('gene_name')['rs_id'].nunique().shape

(14,)

#### Making a master colocalization summary table

In [200]:
# Count the distinct (sid, gene_id) pairs among the colocalized rows with a
# loop, per study and eQTL cell line. A pair seen with several dice_cline
# values is counted once per (study, eqtl_cline).
coloc_summary = (
    coloc_loops_only
    .drop_duplicates(subset=['study', 'eqtl_cline', 'sid', 'gene_id'])
    .groupby(['study', 'eqtl_cline'])
    # .size() counts rows per group directly. The previous .apply(len) ran a
    # Python function over each group frame (grouping columns included),
    # produced a column literally named 0, and did not return a count column
    # when the selection was empty.
    .size()
    .reset_index(name='n_sgls')
)
# Literal (non-regex) removal of the study suffix.
coloc_summary['study'] = coloc_summary['study'].str.replace('_eQTL', '', regex=False)
coloc_summary = coloc_summary.sort_values(
    ['study', 'n_sgls', 'eqtl_cline'], ascending=[True, False, True]
)

coloc_summary.columns = ['Study', 'Cell Line', 'Number of SGLs']

fn = os.path.join(outdir, 'description_of_coloc_study.xlsx')
coloc_summary.to_excel(fn, index=False)