In [1]:
import pandas as pd
import numpy as np

import hail as hl
from hail.linalg import BlockMatrix
# Start the Hail session for this kernel. The recorded output below shows it
# attaching to Apache Spark 3.3.2 and reporting Hail version 0.2.120.
hl.init()


Running on Apache Spark version 3.3.2
SparkUI available at http://hn-cluster-m.europe-west1-d.c.open-targets-genetics-dev.internal:36427
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.120-f00f916faf78
LOGGING: writing to /home/hn9/hail-20231010-1600-0.2.120-f00f916faf78.log


In [2]:
from pyspark.sql import SparkSession
# Obtain the Spark session used to build Spark DataFrames later on.
# NOTE(review): hl.init() ran first, so getOrCreate() presumably returns the
# session Hail already started rather than a new one - confirm if the app
# name matters.
spark = (
    SparkSession.builder
    .appName("Finemapping Pipeline")
    .getOrCreate()
)


In [3]:
def get_gnomad_ld_matrix(lead_snp_ID, window_size=500000):
    """Extract the gnomAD LD sub-matrix around a lead variant.

    The lead variant is given in GRCh38, lifted over to GRCh37 to query the
    gnomAD 2.1.1 genomes "nfe" LD block matrix and its variant index table,
    and the variants found in the window are labelled with GRCh38-based IDs.

    Args:
        lead_snp_ID: Variant string understood by ``hl.parse_variant`` on
            GRCh38, e.g. ``"chr8:27610986:C:A"``.
        window_size: Half-width of the window in base pairs; variants on the
            same GRCh37 contig within ``window_size`` of the lead position
            are kept. Defaults to 500000.

    Returns:
        A tuple ``(ld_pyspark_df, snp_ids_38)`` where ``ld_pyspark_df`` is a
        Spark DataFrame holding the LD sub-matrix with one column per variant
        and ``snp_ids_38`` is the list of column names, formatted as
        ``contig_position_ref_alt`` in GRCh38 coordinates.

    Raises:
        ValueError: If the lead variant cannot be lifted over to GRCh37, or
            if no variant with a GRCh38 liftover falls inside the window.

    Notes:
        Relies on the notebook-level ``spark`` session.
        Variants in the window whose GRCh37 -> GRCh38 liftover is missing are
        dropped, so every returned column has a well-defined ID.
    """
    rg38 = hl.get_reference('GRCh38')
    rg37 = hl.get_reference('GRCh37')
    # Register each chain file only once: Hail rejects a second registration
    # for the same source/destination pair, which would make this function
    # fail on any call after the first one in a session.
    if not rg37.has_liftover(rg38):
        rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)
    if not rg38.has_liftover(rg37):
        rg38.add_liftover('gs://hail-common/references/grch38_to_grch37.over.chain.gz', rg37)

    bm = BlockMatrix.read("gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.nfe.common.adj.ld.bm")
    variant_table = hl.read_table("gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.nfe.common.adj.ld.variant_indices.ht")

    # Liftover lead SNP ID to GRCh37. Evaluating the expression directly
    # yields a concrete locus (or None when liftover fails) without building
    # and collecting a temporary one-row table.
    lead_locus_37 = hl.eval(
        hl.liftover(
            hl.parse_variant(lead_snp_ID, reference_genome='GRCh38').locus,
            'GRCh37',
        )
    )
    if lead_locus_37 is None:
        raise ValueError(
            f"Lead variant {lead_snp_ID!r} could not be lifted over to GRCh37"
        )

    # Variants on the lead contig within +/- window_size bp of the lead position.
    locus_variants = variant_table.filter(
        (variant_table.locus.contig == lead_locus_37.contig) &
        (hl.abs(variant_table.locus.position - lead_locus_37.position) <= window_size)
    )

    # Get SNP IDs in GRCh38. Done before touching the block matrix so that
    # variants without a GRCh38 position can be excluded from both the matrix
    # and the labels, keeping the two aligned.
    locus_variants = locus_variants.annotate(
        locus_38=hl.liftover(locus_variants.locus, 'GRCh38')
    )
    locus_variants = locus_variants.filter(hl.is_defined(locus_variants.locus_38))
    locus_variants = locus_variants.annotate(
        snp_id_38=hl.str(locus_variants.locus_38.contig) + "_" +
                  hl.str(locus_variants.locus_38.position) + "_" +
                  locus_variants.alleles[0] + "_" +
                  locus_variants.alleles[1]
    )

    # Collect matrix indices and IDs in a single pass so they cannot drift
    # apart, and sort by index because BlockMatrix.filter expects increasing
    # indices.
    rows = locus_variants.select('idx', 'snp_id_38').collect()
    if not rows:
        raise ValueError(
            f"No gnomAD LD variants found within {window_size} bp of {lead_snp_ID!r}"
        )
    rows.sort(key=lambda row: row.idx)
    indices = [row.idx for row in rows]
    snp_ids_38 = [row.snp_id_38 for row in rows]

    # Get LD matrix
    # NOTE(review): the ld_df displayed later in this notebook has zeros below
    # the diagonal, so the stored matrix appears to be upper-triangular only;
    # symmetrise before use if downstream code needs the full matrix - confirm.
    sub_bm = bm.filter(indices, indices)
    ld_df = pd.DataFrame(sub_bm.to_numpy(), columns=snp_ids_38)
    # need to change to iteritems due to old pandas version error
    ld_df.iteritems = ld_df.items
    ld_pyspark_df = spark.createDataFrame(ld_df)

    return ld_pyspark_df, snp_ids_38

In [4]:
# Example run: LD frame and GRCh38 SNP IDs for the lead variant
# chr8:27610986:C:A (the function parses this string against GRCh38).
ld_pyspark_df, snp_ids_38 = get_gnomad_ld_matrix("chr8:27610986:C:A")

In [None]:
def get_matching_snps(ld_pyspark_df, snp_ids_38, study_locus=None):
    """Restrict an LD frame and a study-locus frame to their shared variants.

    Args:
        ld_pyspark_df: Spark DataFrame whose column names are variant IDs.
        snp_ids_38: Iterable of variant IDs present in the LD frame.
        study_locus: Spark DataFrame with a ``variantID`` column. When
            omitted, the notebook-level ``StudyLocus`` variable is used,
            which is what the function originally depended on.

    Returns:
        A tuple ``(filtered_ld_pyspark_df, filtered_StudyLocus)``: the LD
        frame reduced to the shared variant columns (original column order
        kept) and the study-locus frame reduced to rows whose ``variantID``
        is one of the shared variants.

    Notes:
        NOTE(review): only the columns of the LD frame are subset here; its
        rows are left untouched, so the result is not square. Row filtering
        needs a row identifier that this frame does not carry - confirm how
        callers are expected to handle that.
    """
    if study_locus is None:
        # Backward-compatible fallback to the notebook global.
        study_locus = StudyLocus

    # The frame is narrowed to "variantID", so that is the only field each
    # collected row exposes.
    study_snps = {row.variantID for row in study_locus.select("variantID").collect()}

    # Find the intersection of the SNPs
    common_snps = study_snps.intersection(snp_ids_38)

    # Filter StudyLocus to only include common SNPs. The column is taken from
    # the frame itself, so no pyspark.sql.functions alias is required, and it
    # is the same "variantID" column the IDs were read from.
    filtered_StudyLocus = study_locus.filter(
        study_locus["variantID"].isin(list(common_snps))
    )

    # Filter LD matrix to only include the common SNPs
    selected_columns = [col for col in ld_pyspark_df.columns if col in common_snps]
    filtered_ld_pyspark_df = ld_pyspark_df.select(selected_columns)

    return filtered_ld_pyspark_df, filtered_StudyLocus

# Testing outside of function:

In [6]:
rg38 = hl.get_reference('GRCh38')
rg37 = hl.get_reference('GRCh37')
# get_gnomad_ld_matrix registers these same chain files when it is called
# earlier in the notebook, and Hail rejects a second registration for the
# same source/destination pair. Guard each call so this cell is safe to run
# (and re-run) regardless of what executed before it.
if not rg37.has_liftover(rg38):
    rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)
if not rg38.has_liftover(rg37):
    rg38.add_liftover('gs://hail-common/references/grch38_to_grch37.over.chain.gz', rg37)

In [9]:
# Scratch re-run of the body of get_gnomad_ld_matrix at notebook scope, so the
# intermediate objects (locus_variants, ld_df, ...) stay available for
# inspection in the following cells.
# NOTE(review): hl.liftover below needs the GRCh38 <-> GRCh37 chain files to
# be registered already; presumably the preceding cell did that - confirm
# when running on a fresh kernel.
lead_snp_ID = "chr8:27610986:C:A"
bm = BlockMatrix.read("gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.nfe.common.adj.ld.bm")
variant_table = hl.read_table("gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.nfe.common.adj.ld.variant_indices.ht")
# Liftover lead SNP ID to GRCh37
# The parsed locus is round-tripped through a one-row table to obtain concrete
# contig/position values, which are then turned back into a locus expression
# for the liftover call.
locus = hl.parse_variant(lead_snp_ID,reference_genome='GRCh38').locus
temp_table = hl.Table.parallelize([hl.struct(locus=locus)])
locus_values = temp_table.select('locus').collect()
locus_value = locus_values[0].locus
contig = locus_value.contig
position = locus_value.position
locus_37 = hl.liftover(hl.locus(contig, position, 'GRCh38'), 'GRCh37')

# Get LD matrix
# Keep variants on the lead contig whose position is within window_size bp of
# the lifted-over lead position.
window_size = 500000
locus_variants = variant_table.filter(
    (hl.abs(variant_table.locus.position - locus_37.position) <= window_size) &
    (variant_table.locus.contig == locus_37.contig)
)
# The collected 'idx' values select the same rows and columns of the block matrix.
indices = locus_variants['idx'].collect()
sub_bm = bm.filter(indices, indices)
numpy_array = sub_bm.to_numpy()
# No column labels are assigned in this cell, so the frame keeps pandas'
# default integer labels (visible in the schema printed in the next cell).
ld_df = pd.DataFrame(numpy_array)
# need to change to iteritems due to old pandas version error
ld_df.iteritems = ld_df.items
ld_pyspark_df = spark.createDataFrame(ld_df)

In [10]:
# Print the Spark schema of the LD frame; the recorded output lists
# integer-named double columns.
ld_pyspark_df.printSchema()

root
 |-- 0: double (nullable = true)
 |-- 1: double (nullable = true)
 |-- 2: double (nullable = true)
 |-- 3: double (nullable = true)
 |-- 4: double (nullable = true)
 |-- 5: double (nullable = true)
 |-- 6: double (nullable = true)
 |-- 7: double (nullable = true)
 |-- 8: double (nullable = true)
 |-- 9: double (nullable = true)
 |-- 10: double (nullable = true)
 |-- 11: double (nullable = true)
 |-- 12: double (nullable = true)
 |-- 13: double (nullable = true)
 |-- 14: double (nullable = true)
 |-- 15: double (nullable = true)
 |-- 16: double (nullable = true)
 |-- 17: double (nullable = true)
 |-- 18: double (nullable = true)
 |-- 19: double (nullable = true)
 |-- 20: double (nullable = true)
 |-- 21: double (nullable = true)
 |-- 22: double (nullable = true)
 |-- 23: double (nullable = true)
 |-- 24: double (nullable = true)
 |-- 25: double (nullable = true)
 |-- 26: double (nullable = true)
 |-- 27: double (nullable = true)
 |-- 28: double (nullable = true)
 |-- 29: double (nu

In [14]:
# Display the pandas LD frame (the recorded repr is truncated with "...").
# The visible values are zero below the diagonal.
ld_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5198,5199,5200,5201,5202,5203,5204,5205,5206,5207
0,1.0,0.010614,-0.084581,-0.991658,-0.117947,0.994123,-0.164365,-0.197324,-0.055195,-0.080704,...,-0.002417,0.002095,0.000605,0.041263,-0.003097,0.000800,-0.017153,0.001865,-0.002144,0.000099
1,0.0,1.000000,0.004777,0.032266,0.007932,0.006794,0.004736,0.002940,0.000669,0.001326,...,0.011473,0.002612,-0.052152,-0.047007,0.005065,0.007013,0.005431,0.004873,0.005062,0.005014
2,0.0,0.000000,1.000000,0.084156,-0.015421,-0.085327,0.214229,-0.044191,-0.022065,-0.008989,...,0.009414,-0.002061,0.020580,-0.012528,0.005344,0.029391,0.022161,0.028194,0.005250,0.005714
3,0.0,0.000000,0.000000,1.000000,0.118738,-0.994835,0.164127,0.197554,0.054447,0.081230,...,0.003922,-0.000941,-0.002872,-0.043145,0.004130,-0.000851,0.017985,0.000157,0.003299,0.000777
4,0.0,0.000000,0.000000,0.000000,1.000000,-0.118018,0.349038,-0.078700,-0.021882,-0.048128,...,0.029716,0.002448,-0.015684,-0.027860,0.044722,-0.006272,-0.005358,0.018813,0.046526,0.046688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5203,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.771106,-0.023609,0.197099,0.197898
5204,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,-0.023988,0.160840,0.159666
5205,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.140870,0.139334
5206,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.993329


In [11]:
# Get SNP IDs in GRCh38
locus_variants = locus_variants.annotate(
    locus_38 = hl.liftover(locus_variants.locus, 'GRCh38')
)
locus_variants = locus_variants.annotate(
    snp_id_38 = hl.str(locus_variants.locus_38.contig) + "_" +
                hl.str(locus_variants.locus_38.position) + "_" +
                locus_variants.alleles[0] + "_" +
                locus_variants.alleles[1]
)
snp_ids_38 = locus_variants['snp_id_38'].collect()

In [12]:
# Inspect the collected GRCh38 SNP IDs (format: contig_position_ref_alt).
snp_ids_38

['chr8_27111015_G_A',
 'chr8_27111091_T_C',
 'chr8_27111105_G_A',
 'chr8_27111258_C_T',
 'chr8_27111328_A_G',
 'chr8_27111470_G_A',
 'chr8_27111699_G_GGT',
 'chr8_27111699_G_GGTGTGT',
 'chr8_27111699_G_GGTGTGTGT',
 'chr8_27111701_T_G',
 'chr8_27111716_G_GTATATATA',
 'chr8_27111724_G_GTATATATA',
 'chr8_27111735_T_C',
 'chr8_27111826_T_A',
 'chr8_27112042_T_C',
 'chr8_27112084_A_G',
 'chr8_27112121_G_GA',
 'chr8_27112174_A_T',
 'chr8_27112269_A_C',
 'chr8_27112275_C_T',
 'chr8_27112523_A_G',
 'chr8_27112526_A_G',
 'chr8_27112613_A_T',
 'chr8_27113048_C_T',
 'chr8_27113539_C_G',
 'chr8_27113626_C_T',
 'chr8_27113652_G_A',
 'chr8_27113671_C_G',
 'chr8_27113674_C_T',
 'chr8_27113675_G_A',
 'chr8_27113687_G_A',
 'chr8_27113689_G_A',
 'chr8_27113770_C_T',
 'chr8_27113775_C_T',
 'chr8_27113862_G_T',
 'chr8_27113885_G_A',
 'chr8_27113992_C_T',
 'chr8_27114112_G_A',
 'chr8_27115682_C_CA',
 'chr8_27115784_T_A',
 'chr8_27116512_T_A',
 'chr8_27118979_A_G',
 'chr8_27119059_T_C',
 'chr8_27119209_A_G'