In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import subprocess
from collections import Counter

In [32]:
# Root directories used to build file paths in this notebook.
# Databases root: the paper data (papers/38014075/...) and the UCSC liftover chain file are read from under it.
PROJECT_DIR_d = "/fs/cbsuhy02/storage/yc2553/yc2553/databases/"
# Second databases directory (not referenced by the cells shown in this section).
PROJECT_DIR_d2 = "/home/yc2553/projects/HEA/databases/"
# Project output directory (not referenced by the cells shown in this section).
PROJECT_DIR_o = "/fs/cbsuhy02/storage/yc2553/yc2553/projects/3.Human_atlas/output/"

# Fine-mapped GWAS variants

In [4]:
# UKB fine-mapping results (94 complex diseases and traits); coordinates are hg19.
# Documentation: https://docs.google.com/document/d/14LWxqlSC6hl9FtA984CQjUdFcgQQkXuffYcbXaUoqGM/edit?tab=t.0
# The primary .tsv/.bed files contain variants in 95% CSs or with PIPs > 0.001.
# All fine-mapped regions and variants (including the ~1% that failed) are listed in the secondary region .bed file.

inputfile = f"{PROJECT_DIR_d}papers/38014075/GWAS/UKBB_94traits_release1.bed.gz"
# The file is read without a header row, so columns are labeled by position (0, 1, 2, ...)
df_gwas = pd.read_csv(inputfile, sep="\t", header=None)
df_gwas.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
0,chr1,755434.0,755435.0,chr1:755435:T:G,rs184270342,T,G,G,UKBB,BOLT,SUSIE,IGF1,chr1:708908-3708908,0.00555,-0.041597,0.015979,6.7767,0.001039,-1,-3.9e-05,0.001322,False,False
1,chr1,759599.0,759602.0,chr1:759600:AGT:A,1:759600_AGT_A,AGT,A,A,UKBB,BOLT,SUSIE,IGF1,chr1:708908-3708908,0.006434,-0.037406,0.014444,6.70666,0.001013,-1,-3.5e-05,0.001175,False,False


In [5]:
# Count the rows of df_gwas per value of column 10.
# Column 10 = method: the fine-mapping method used

Counter(df_gwas[10])

Counter({'SUSIE': 3633569, 'FINEMAP': 1744310})

In [15]:
# Column 17 = pip: posterior probability of association from fine-mapping
# Column 18 = cs_id: ID of 95% credible set (-1 indicates that the variant is not in a 95% CS)
# Sanity check of the inclusion rule (in a 95% CS or PIP > 0.001):
# the third count (PIP <= 0.001 and not in any CS) is expected to be 0.

pip_above_cutoff = df_gwas[17] > 0.001
pip_at_or_below_cutoff = df_gwas[17] <= 0.001
in_credible_set = df_gwas[18] != -1
outside_credible_set = df_gwas[18] == -1
(
	int(pip_above_cutoff.sum()),
	int(in_credible_set.sum()),
	int((pip_at_or_below_cutoff & outside_credible_set).sum()),
)

(5083378, 1895848, 0)

In [6]:
# Count the rows of df_gwas per value of the two flag columns (21 and 22) described below.
# LD_HWE: indicator that the variant is in LD (R^2 > 0.6) with a variant that failed Hardy-Weinberg equilibrium (p < 10^-12) that was also used in phasing based upon UK10K LD (http://www.nealelab.is/blog/2019/9/17/genotyped-snps-in-uk-biobank-failing-hardy-weinberg-equilibrium-test). These are flagged as lower confidence.
# LD_SV: indicator that the variant is in LD (R^2 > 0.8) with a common structural variant based upon European samples from gnomAD (Collins et al. bioRxiv 2019). These are flagged as lower confidence.

Counter(df_gwas[21]), Counter(df_gwas[22])

(Counter({False: 5033193, True: 344686}),
 Counter({False: 5067456, True: 310423}))

In [16]:
# Keep only the rows where both flag columns (21 and 22) are False,
# then show the row counts before and after filtering.
passes_ld_flags = df_gwas[21].eq(False) & df_gwas[22].eq(False)
df_gwas_filtered = df_gwas.loc[passes_ld_flags]
df_gwas.shape[0], df_gwas_filtered.shape[0]

(5377879, 4827882)

In [19]:
# Columns written to the per-method BED files (by position in df_gwas):
#   0  chromosome: chromosome in hg19 coordinates (autosomes only)
#   1  start: start position of variant in hg19 coordinates (0-indexed)
#   2  end: end position of variant in hg19 coordinates (0-indexed)
#   3  variant: unique variant identifier (chr:pos:ref:alt)
#   4  rsid: rsID identifier
#   11 trait: abbreviation for phenotype used for genetic association tests
#   17 pip: posterior probability of association from fine-mapping

methods = ["SUSIE", "FINEMAP"]
snp_alleles = ["A", "T", "C", "G"]
bed_columns = [0, 1, 2, 3, 4, 11, 17]
for method in methods:
	# One pair of output files per fine-mapping method
	uses_method = df_gwas_filtered[10] == method
	# SNPs only, no indels: after liftover to hg38 an indel is sometimes fragmented into two regions
	is_snp = df_gwas_filtered[5].isin(snp_alleles) & df_gwas_filtered[6].isin(snp_alleles)
	df = (
		df_gwas_filtered
		.loc[uses_method & is_snp, bed_columns]
		# start/end are converted to integers before writing
		.astype({1: int, 2: int})
	)
	outputfile = f"{PROJECT_DIR_d}papers/38014075/GWAS/UKBB_94traits_release1_filtered_{method}.bed.gz"
	df.to_csv(outputfile, sep="\t", header=None, index=False)
	# One row per variant ID (column 3), used as the liftover input
	df = df.drop_duplicates(subset=3)
	outputfile = f"{PROJECT_DIR_d}papers/38014075/GWAS/UKBB_94traits_release1_unique_variant_{method}.bed.gz"
	df.to_csv(outputfile, sep="\t", header=None, index=False)

In [381]:
# Liftover to hg38
# For each method, CrossMap converts the unique-variant hg19 BED file with the hg19ToHg38 chain,
# and the resulting hg38 BED file is then gzip-compressed (-f overwrites an existing .gz).
# Commands are passed as argument lists (no shell), and check=True raises
# subprocess.CalledProcessError when a tool exits with a non-zero status, so a failed
# liftover cannot go unnoticed and leave stale or missing files for the next cell.

chain_file = f"{PROJECT_DIR_d}UCSC/liftover/hg19ToHg38.over.chain.gz"
for method in methods:
	bed_hg19 = f"{PROJECT_DIR_d}papers/38014075/GWAS/UKBB_94traits_release1_unique_variant_{method}.bed.gz"
	bed_hg38 = f"{PROJECT_DIR_d}papers/38014075/GWAS/UKBB_94traits_release1_unique_variant_{method}_hg38.bed"
	subprocess.run(["CrossMap.py", "bed", chain_file, bed_hg19, bed_hg38], check=True)
	subprocess.run(["gzip", "-f", bed_hg38], check=True)

2025-01-20 02:34:51 [INFO]  Read the chain file "/fs/cbsuhy02/storage/yc2553/yc2553/databases/UCSC/liftover/hg19ToHg38.over.chain.gz" 
2025-01-20 02:35:19 [INFO]  Read the chain file "/fs/cbsuhy02/storage/yc2553/yc2553/databases/UCSC/liftover/hg19ToHg38.over.chain.gz" 


In [21]:
# Replace the genomic positions
# For each method, join the filtered hg19 table (one row per variant/trait) with the lifted
# unique-variant table on the variant ID (column 3), and write the rows with hg38 coordinates.
# Only the lifted coordinates and the join key are taken from the hg38 file: it may carry the
# same extra columns (4, 5, 6) as the filtered file, and merging them in would rename those
# columns to "4_x"/"4_y" etc., so selecting 4, 5, 6 afterwards would raise a KeyError.

for method in methods:
	df1 = pd.read_table(f"{PROJECT_DIR_d}papers/38014075/GWAS/UKBB_94traits_release1_filtered_{method}.bed.gz", header=None)
	df2 = pd.read_table(f"{PROJECT_DIR_d}papers/38014075/GWAS/UKBB_94traits_release1_unique_variant_{method}_hg38.bed.gz", header=None)
	hg38_coords = df2[[0, 1, 2, 3]].rename(columns={0: "chrom_hg38", 1: "start_hg38", 2: "end_hg38"})
	# Default inner join: variants missing from the hg38 file (presumably those that could not be lifted) are dropped
	df_merge = pd.merge(df1, hg38_coords, on=3)
	outputfile = f"{PROJECT_DIR_d}papers/38014075/GWAS/UKBB_94traits_release1_filtered_{method}_hg38.bed.gz"
	# Output columns: hg38 chrom/start/end, then variant ID, rsID, trait and PIP from the filtered file
	df_merge[["chrom_hg38", "start_hg38", "end_hg38", 3, 4, 5, 6]].to_csv(outputfile, sep="\t", header=None, index=False)

# Fine-mapped eQTL variants

In [38]:
# GTEx fine-mapping results, downloaded from https://www.finucanelab.org/data/
# Documentation: https://docs.google.com/document/d/1FwUwptpR2UoG9yGYIxSKJRevMpzeIfUfc5Q0Ut23fy8/edit?tab=t.0
# According to the documentation, variants with PIPs > 0.0001 are included in the primary .tsv.bgz file.

inputfile = f"{PROJECT_DIR_d}papers/38014075/eQTL/GTEx_49tissues_release1.tsv.bgz"
# The compression is stated explicitly; the file name ends in .bgz rather than .gz
df_eqtl = pd.read_csv(inputfile, sep="\t", compression="gzip")
df_eqtl.head(2)

Unnamed: 0,chromosome,start,end,variant,variant_hg38,allele1,allele2,minor_allele,cohort,method,tissue,gene,maf,beta_marginal,se_marginal,z,pip,cs_id,beta_posterior,sd_posterior
0,chr1,13549.0,13550.0,chr1_13550_G_A,chr1_13550_G_A_b38,G,A,A,GTEx,FINEMAP,Adipose_Subcutaneous,ENSG00000177757.2,0.015491,0.322529,0.175853,1.83408,0.000297,1,0.000121,0.00791
1,chr1,13549.0,13550.0,chr1_13550_G_A,chr1_13550_G_A_b38,G,A,A,GTEx,FINEMAP,Adipose_Subcutaneous,ENSG00000227232.5,0.015491,0.243701,0.201673,1.2084,0.000298,1,7.5e-05,0.005642


In [40]:
# Count the rows of df_eqtl per value of the "method" column.
# method: the fine-mapping method used

Counter(df_eqtl["method"])

Counter({'FINEMAP': 29969991, 'SUSIE': 12312896})

In [44]:
# The documented inclusion rule (PIP > 0.0001) seems inaccurate: rows with PIP <= 0.0001 are present.
# Hypothesis: variants located in a 95% CS are included regardless of PIP.
# First count: rows with PIP <= 0.0001.
# Second count: rows with PIP <= 0.0001 that are NOT in a credible set (cs_id == -1, assuming the
# same cs_id convention as the GWAS release); 0 here supports the hypothesis.
# Both counts use the same 0.0001 cutoff; a cutoff of 0.000 would make the second count trivially 0.

len(df_eqtl[df_eqtl["pip"]<=0.0001]), len(df_eqtl[(df_eqtl["pip"]<=0.0001) & (df_eqtl["cs_id"]==-1)])

(2492, 0)

In [58]:
# Write one hg38 BED-like file per fine-mapping method with the columns we need:
# chrom/start/end: hg38 coordinates parsed from variant_hg38 (unique variant identifier in hg38)
# variant: variant identifier column of the input table
# tissue: Tissue of the eQTL effect
# gene: Gene ENSGID
# pip: posterior probability of association from fine-mapping
# NOTE(review): the ID column written is "variant", not "variant_hg38" - confirm this is intended.

snp_alleles = ["A", "T", "C", "G"]
for method in methods:
	# Rows of the current method only
	df = df_eqtl.loc[df_eqtl["method"].eq(method)].copy()
	# Split variant_hg38 on "_" into five fields; the second field is used as the BED end
	df[['chrom', 'end', 'ref', 'alt', 'build']] = df['variant_hg38'].str.split('_', expand=True)
	# SNPs only: both alleles must be a single base
	is_snp = df["ref"].isin(snp_alleles) & df["alt"].isin(snp_alleles)
	df = (
		df.loc[is_snp]
		.assign(end=lambda d: d["end"].astype(int))
		# 0-based start
		.assign(start=lambda d: d["end"] - 1)
	)
	outputfile = f"{PROJECT_DIR_d}papers/38014075/eQTL/GTEx_49tissues_release1_{method}_hg38.bed.gz"
	bed_columns = ["chrom", "start", "end", "variant", "tissue", "gene", "pip"]
	df[bed_columns].to_csv(outputfile, sep="\t", header=None, index=False)

In [61]:
# One variant can be associated with multiple genes, so for each (variant, tissue) pair
# we keep the row with the maximum PIP. The gene column (5) is not written.
# Input columns by position: 0 chrom, 1 start, 2 end, 3 variant, 4 tissue, 5 gene, 6 pip

for method in methods:
	inputfile = f"{PROJECT_DIR_d}papers/38014075/eQTL/GTEx_49tissues_release1_{method}_hg38.bed.gz"
	df = pd.read_csv(inputfile, sep="\t", header=None)
	# Row label of the highest-PIP entry within each (variant, tissue) group
	top_pip_rows = df.groupby([3, 4])[6].idxmax()
	df_max_pip = df.loc[top_pip_rows, [0, 1, 2, 3, 4, 6]]
	outputfile = f"{PROJECT_DIR_d}papers/38014075/eQTL/GTEx_49tissues_release1_maximum_{method}_hg38.bed.gz"
	df_max_pip.to_csv(outputfile, sep="\t", header=None, index=False)