In [1]:
import os 
import pandas as pd
import subprocess
import glob
import pybedtools as pbt 
pd.set_option('display.max_columns', None)

pbt.set_bedtools_path('/mnt/BioApps/bedtools/bin/')

os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

genome_sizes = 'results/refs/hg19/hg19.chrom.sizes'

# make the directory to save our data
outdir = 'results/notebooks/Intersect_PieQTLs_with_T1D_Significant_GWAS_SNPs/'
os.makedirs(outdir, exist_ok=True)

In [2]:
gs_fn = 'results/refs/hg19/hg19.chrom.nochr.sizes'
gencode_fn = 'results/refs/gencode/v30/gencode.v30.annotation.bed'
gencode = pd.read_table(gencode_fn, header=None)
gencode = gencode.drop_duplicates(5)
gencode_dict = {k:v for k,v in gencode[[5,6]].values.tolist()} 
gencode_dict.update({v:k for k,v in gencode[[5,6]].values.tolist()})

## Load Significant GWAS

In [3]:
major_gwas = ['T1D_32005708', 'T1D_34594039_GCST90018925', 'T1D_34012112_Gaulton']

In [4]:
data = []
gwas = 'results/main/coloc/Data/T1D_GWAS/*/GRCh37/GWAS_input_colocalization_pval_lt_5eMinus8.txt'
for fn in glob.glob(gwas):
    path_info = fn.split('/')
    df = pd.read_table(fn)
    
    if path_info[5] not in major_gwas:
        continue
        
    df['gwas_source'] = path_info[5]
    data.append(df)    
gwas_df = pd.concat(data)
gwas_df['sid'] = gwas_df['CHR'].str.replace('chr', '') + ':' + gwas_df['POS'].astype(str)

# loading finemap data into bedtools
gwas_bed = gwas_df.iloc[:, [0,1,1]]
gwas_bed.columns = ['chrom', 'start', 'end']
gwas_bed['start'] = gwas_bed['start'] - 1
gwas_pbt = pbt.BedTool.from_dataframe(gwas_bed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gwas_bed['start'] = gwas_bed['start'] - 1


In [5]:
individual_gwas_studies = gwas_df.groupby('gwas_source').sid.nunique()
individual_gwas_studies = individual_gwas_studies.to_frame()
individual_gwas_studies.columns = ['Number of Sig. GWAS']

In [6]:
individual_gwas_studies

Unnamed: 0_level_0,Number of Sig. GWAS
gwas_source,Unnamed: 1_level_1
T1D_32005708,21097
T1D_34012112_Gaulton,45798
T1D_34594039_GCST90018925,20419


## Load pieQTLs

In [7]:
pieqtls = glob.glob('results/main/pieqtls/2021_chandra_et_al/*/proximal.pieqtls.tsv')

pieqtl_data = []
for pieqtl in pieqtls:
    print(pieqtl)
    
    cline = pieqtl.split('/')[-2]
    df = pd.read_table(pieqtl)    
    df['cline'] = cline
    pieqtl_data.append(df) 
    
pieqtl_df = pd.concat(pieqtl_data)
pieqtl_df['Chromosome'] = pieqtl_df['Chromosome']
pieqtl_df.loc[:, 'sid'] = pieqtl_df['Chromosome'].str.replace('chr', '') + ':' + \
                          pieqtl_df['pieQTL.Position'].astype(str)

results/main/pieqtls/2021_chandra_et_al/B-cell_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/NK-cell_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/monocyte_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/CD4_T-cell_naive/proximal.pieqtls.tsv
results/main/pieqtls/2021_chandra_et_al/CD8_T-cell_naive/proximal.pieqtls.tsv


In [8]:
individual_cells = pieqtl_df.groupby('cline').Target_geneID.nunique()
individual_cells = individual_cells.to_frame()
individual_cells.columns = ['Number of pieQTL Genes']
individual_cells

Unnamed: 0_level_0,Number of pieQTL Genes
cline,Unnamed: 1_level_1
B-cell_naive,1233
CD4_T-cell_naive,1168
CD8_T-cell_naive,1081
NK-cell_naive,872
monocyte_naive,1065


In [9]:
individual_cells = pieqtl_df.groupby('cline').sid.nunique()
individual_cells = individual_cells.to_frame()
individual_cells.columns = ['Number of pieQTL SNPs']
individual_cells

Unnamed: 0_level_0,Number of pieQTL SNPs
cline,Unnamed: 1_level_1
B-cell_naive,8110
CD4_T-cell_naive,6800
CD8_T-cell_naive,5539
NK-cell_naive,5332
monocyte_naive,6006


In [10]:
pieqtl_bed = pieqtl_df.iloc[:, [1,2,2,-1]]
pieqtl_bed.columns = ['chrom', 'start', 'end', 'cline']
pieqtl_bed.loc[:, 'start'] = pieqtl_bed['start'] - 1
pieqtl_pbt = pbt.BedTool.from_dataframe(pieqtl_bed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pieqtl_bed.loc[:, 'start'] = pieqtl_bed['start'] - 1


In [11]:
pieqtl_bed.shape

(35898, 4)

## Intersect pieQTLs and Sig GWAS

In [12]:
major_cols = ['ge_source',
              'rsid',
             'chrom',
             'pos',
             'geneid',
             'genename',
             'allele1',
             'allele2',
             'maf',
             'beta_x',
             'se',
             'z',
             'prob',
             'log10bf',
             'mean',
             'sd',
             'mean_incl',
             'sd_incl',
             'pval',
             'gwas_source',
             'pieQTL.ID',
             'Chromosome',
             'pieQTL.Position',
             'Target_geneName',
             'TSS',
             'pvalue',
             'FDR (DICE)',
             'beta_y',
             'ref',
             'alt',
             'Mean.TPM.Homozygous.Reference',
             'Mean.TPM.Heterozygous',
             'Mean.TPM.Homozygous.Alternative',
             'Interaction_type',
             'GWAS.Trait',
             'cline',
             'regionID',
             'GWASLoci',
             'index']


### Intersection

In [13]:
intersect_df = pd.merge(gwas_df, pieqtl_df,
                        left_on=['CHR', 'POS'],
                        right_on=['Chromosome', 'pieQTL.Position'],
                        how='inner')

In [14]:
indiv_cell_intersect = intersect_df.groupby('cline').Target_geneID.nunique().to_frame()
indiv_cell_intersect.columns = ['Number of Unique Genes from pieQTL & Sig.GWAS Intersection']
indiv_cell_intersect

Unnamed: 0_level_0,Number of Unique Genes from pieQTL & Sig.GWAS Intersection
cline,Unnamed: 1_level_1
B-cell_naive,39
CD4_T-cell_naive,33
CD8_T-cell_naive,35
NK-cell_naive,45
monocyte_naive,25


In [15]:
indiv_cell_intersect = intersect_df.groupby('cline').sid_x.nunique().to_frame()
indiv_cell_intersect.columns = ['Number of Unique SNPs from pieQTL & Sig.GWAS Intersection']
indiv_cell_intersect

Unnamed: 0_level_0,Number of Unique SNPs from pieQTL & Sig.GWAS Intersection
cline,Unnamed: 1_level_1
B-cell_naive,348
CD4_T-cell_naive,322
CD8_T-cell_naive,269
NK-cell_naive,319
monocyte_naive,116


In [16]:
indiv_cell_intersect = intersect_df.groupby(['cline', 'gwas_source']).Target_geneID.nunique().to_frame()
indiv_cell_intersect.columns = ['Number of Unique Genes from pieQTL & Sig.GWAS Intersection']
indiv_cell_intersect

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Unique Genes from pieQTL & Sig.GWAS Intersection
cline,gwas_source,Unnamed: 2_level_1
B-cell_naive,T1D_32005708,28
B-cell_naive,T1D_34012112_Gaulton,34
B-cell_naive,T1D_34594039_GCST90018925,21
CD4_T-cell_naive,T1D_32005708,15
CD4_T-cell_naive,T1D_34012112_Gaulton,29
CD4_T-cell_naive,T1D_34594039_GCST90018925,14
CD8_T-cell_naive,T1D_32005708,18
CD8_T-cell_naive,T1D_34012112_Gaulton,30
CD8_T-cell_naive,T1D_34594039_GCST90018925,15
NK-cell_naive,T1D_32005708,34


In [17]:
indiv_cell_intersect = intersect_df.groupby(['cline', 'gwas_source']).sid_x.nunique().to_frame()
indiv_cell_intersect.columns = ['Number of Unique SNPs from pieQTL & Sig.GWAS Intersection']
indiv_cell_intersect

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Unique SNPs from pieQTL & Sig.GWAS Intersection
cline,gwas_source,Unnamed: 2_level_1
B-cell_naive,T1D_32005708,186
B-cell_naive,T1D_34012112_Gaulton,241
B-cell_naive,T1D_34594039_GCST90018925,116
CD4_T-cell_naive,T1D_32005708,142
CD4_T-cell_naive,T1D_34012112_Gaulton,198
CD4_T-cell_naive,T1D_34594039_GCST90018925,53
CD8_T-cell_naive,T1D_32005708,136
CD8_T-cell_naive,T1D_34012112_Gaulton,155
CD8_T-cell_naive,T1D_34594039_GCST90018925,73
NK-cell_naive,T1D_32005708,171


In [18]:
def get_genename(x):
    if x in gencode_dict:
        return(gencode_dict[x])
    else:
        return(x)

In [19]:
intersect_df.loc[:, 'geneid'] = intersect_df['Target_geneID'].str.replace('\.[0-9]*', '', regex=True)
intersect_df.loc[:, 'genename'] = intersect_df.loc[:, 'geneid'].apply(get_genename)

In [29]:
intersect_df.loc[:, 'geneid'].nunique()

91

In [20]:
for x in intersect_df.genename.unique():
    print(x)

C1orf216
SF3A3
FHL3
PTPN22
RGS1
NPM1P33
KRT18P39
CD28
SLC22A5
BTN3A1
BTN2A2
ZSCAN26
ZNF165
AL645939.1
IFITM4P
ZDHHC20P1
ZFP57
AL645929.1
HCG4B
AL671277.2
RNF39
FLOT1
DHX16
PPP1R18
ZNRD1ASP
TRIM26
VARS2
MRPS18B
TUBB
C6orf136
RF00019
PRRC2A
BAG6
CCHCR1
CSNK2B
C6orf47
TCF19
HCG27
AL662844.3
AL662844.4
MICB
MICA
AL645933.2
LST1
NCR3
LY6G5B
LY6G5C
C6orf48
AIF1
MSH5
PPT2
SKIV2L
BTNL2
ENSG00000228962
TSBP1
HCG24
TAP2
WDR46
PFDN6
RPL32P1
BACH2
AP003774.4
M6PR
LINC02390
SUOX
ATXN2
TMEM116
HECTD4
MAPKAPK5
CTSH
AC009121.1
RMI2
DND1P1
KANSL1-AS1
AP005482.1
ZGLP1
PRKD2
TMPRSS3
ZSCAN9
TRIM27
ENSG00000261353
BTN2A3P
ZKSCAN4
BTN2A1
BTN3A2
HMGN4
HCG18
GNL1
ZSCAN23
ZNF192P1


## Summarize

In [21]:
# find the unique SNPs
uniq_snps_by_cells = intersect_df.groupby('cline').sid_x.nunique()
uniq_snps_by_cells = uniq_snps_by_cells.to_frame()

# find the unique genes 
uniq_genes_by_cells = intersect_df.groupby('cline').geneid.nunique()
uniq_genes_by_cells = uniq_genes_by_cells.to_frame()

# merge snps and genes
uniq_counts_by_cells = pd.merge(uniq_snps_by_cells, uniq_genes_by_cells, left_index=True, right_index=True)
uniq_counts_by_cells.columns = ['Number of Unique SNPs', 'Number of Unique Genes']
uniq_counts_by_cells.index.name = 'Cell Line'

In [22]:
# save the file
excel_analysis = os.path.join(outdir, 'Unique_Counts_By_Cell_Line.xlsx')
uniq_counts_by_cells.to_excel(excel_analysis, sheet_name='pieqtls')

In [23]:
uniq_genes_by_cells

Unnamed: 0_level_0,geneid
cline,Unnamed: 1_level_1
B-cell_naive,39
CD4_T-cell_naive,33
CD8_T-cell_naive,35
NK-cell_naive,45
monocyte_naive,25


In [24]:
excel_analysis

'results/notebooks/Intersect_PieQTLs_with_T1D_Significant_GWAS_SNPs/Unique_Counts_By_Cell_Line.xlsx'

#### Write the gene list as well

In [25]:
gh_list = sorted(intersect_df.geneid.unique().tolist())
gh_fn = os.path.join(outdir, 'gene_list.txt')
with open(gh_fn, 'w') as fw:
    for x in gh_list:
        fw.write('{}\n'.format(x))

In [30]:
gh_list = sorted(intersect_df.geneid.unique().tolist())


In [32]:
len(gh_list)

91

In [26]:
intersect_df[['cline', 'geneid']]

Unnamed: 0,cline,geneid
0,monocyte_naive,ENSG00000142686
1,NK-cell_naive,ENSG00000183431
2,CD4_T-cell_naive,ENSG00000183386
3,CD8_T-cell_naive,ENSG00000183386
4,NK-cell_naive,ENSG00000183431
...,...,...
2137,CD4_T-cell_naive,ENSG00000271821
2138,CD4_T-cell_naive,ENSG00000206344
2139,CD4_T-cell_naive,ENSG00000271821
2140,NK-cell_naive,ENSG00000206344


#### Get the unique genes per cell type 

In [27]:
genes_by_cell = intersect_df[['cline', 'geneid']].drop_duplicates()
genes_by_cell.sort_values(['cline', 'geneid'], inplace=True)
genes_by_cell['source'] = 'pieqtls_with_sig_gwas'
genes_by_cell.columns = ['cline', 'geneid', 'source']
fn = os.path.join(outdir, 'genes_by_cell.xlsx')
genes_by_cell.to_excel(fn, index=False)