In [1]:
#imports

from gentropy.common.session import Session
from gentropy.dataset.summary_statistics import SummaryStatistics
from pyspark.sql.functions import col
from gentropy.dataset.study_locus import StudyLocus
from gentropy.dataset.study_index import StudyIndex
import hail as hl
import os

### Set up session and import data, glimpse data

In [3]:

#create session
#increase ram
# session.spark.conf.set("spark.executor.memory", "6g")
session = Session(extended_spark_conf={"spark.driver.memory": "4g"})

#path_cred_sets="gs://genetics_etl_python_playground/releases/24.03/credible_set/finngen_susie"
#local path
path_cred_sets="data/"
sl=StudyLocus.from_parquet(session=session,path=path_cred_sets)

df=sl.df
df.show()

df.printSchema()
df.groupBy('chromosome').count().show()

# Chromosome to analyze
chromosome = 1


24/06/19 13:01:46 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


ValueError: Parquet file is empty: data/

#### Count rows of data for FINNGEN

In [76]:
sl.df=df

df.show()
#total rows
df.count()
# 13966

+--------------------+----------------+----------+---------+--------------------+--------------------+----------+------+--------------+--------------+-------------------------------+-------------+-------------------+---------------+-----------------+----------------+------------------+------------+-----------+----------+-----+--------------------+
|        studyLocusId|       variantId|chromosome| position|              region|             studyId|      beta|zScore|pValueMantissa|pValueExponent|effectAlleleFrequencyFromSource|standardError|subStudyDescription|qualityControls|finemappingMethod|credibleSetIndex|credibleSetlog10BF|purityMeanR2|purityMinR2|sampleSize|ldSet|               locus|
+--------------------+----------------+----------+---------+--------------------+--------------------+----------+------+--------------+--------------+-------------------------------+-------------+-------------------+---------------+-----------------+----------------+------------------+------------+-

13966

#### Number of distinct studies

1194 studies in total

In [75]:
#number of unique study ids
df.select('studyId').distinct().count() # number of unique study ids/traits
# 1194

1194

#### Load study index

##### Number of distinct studies in index

2408

In [78]:
study_index_df=study_index.df
#number of unique study ids
print(study_index_df.select('studyId').distinct().count()) # number of unique study ids/traits
#number of rows
print(study_index_df.count())
# Both 2408 so rows = unique study ids

2408
2408


##### Join the study locus table and study index to find overlap

In [79]:
joined_df = df.join(study_index_df, on='studyId', how='inner')
joined_df.show(5)
print(joined_df.count())
# 13966 # No data loss in join

left_joined_df = df.join(study_index_df, on='studyId', how='left')
left_joined_df.show(5)
print(left_joined_df.count())
# 13966 # No data loss in left join

+--------------------+--------------------+---------------+----------+---------+--------------------+----------+------+--------------+--------------+-------------------------------+-------------+-------------------+---------------+-----------------+----------------+------------------+------------+-----------+----------+-----+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+
|             studyId|        studyLocusId|      variantId|chromosome| position|              region|      beta|zScore|pValueMantissa|pValueExponent|effectAlleleFrequencyFromSource|standardError|subStudyDescription|qualityControls|finemappingMethod|credible

In [80]:
# filter out rows where studyId is null - no entries present
#df.filter(col("studyId").isNull()).show()

#unique trait id arrays
study_index_df.select('traitFromSourceMappedIds').distinct().count()


1214

##### Check anit-filter for rows removed

In [82]:
# show data that has empty traitFromSourceMappedIds
empty_trait_df = left_joined_df.filter((col("traitFromSourceMappedIds").isNull()) | (size(col("traitFromSourceMappedIds")) == 0))
empty_trait_df.show(5)

+--------------------+--------------------+------------------+----------+---------+--------------------+----------+------+--------------+--------------+-------------------------------+-------------+-------------------+---------------+-----------------+----------------+------------------+------------+-----------+----------+-----+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+
|             studyId|        studyLocusId|         variantId|chromosome| position|              region|      beta|zScore|pValueMantissa|pValueExponent|effectAlleleFrequencyFromSource|standardError|subStudyDescription|qualityControls|finemappingMethod|cr

##### Counts for df sizes

Study Locus df = 13966

Joined SL and SI df = 13966

Filtered left-joined DF = 8418

In [83]:
print(df.count())
print(joined_df.count())
print(filtered_df.count())


13966
13966
8418


# Run from here

In [29]:
#imports

from gentropy.common.session import Session
from gentropy.dataset.summary_statistics import SummaryStatistics
from pyspark.sql.functions import col
from gentropy.dataset.study_locus import StudyLocus
from gentropy.dataset.study_index import StudyIndex
import hail as hl
import os

# Chromosome to analyze
chromosome = 1

### Load study Index

In [12]:
# load study index
path_si="data/index/"
study_index=StudyIndex.from_parquet(session=session, path=path_si)

#### Use Gentropy SL overlap function to find overlap between CSs for using COLOC

In [3]:
from gentropy.dataset.study_locus import StudyLocus, StudyLocusOverlap
from datetime import datetime
#create session
session = Session(extended_spark_conf={"spark.driver.memory": "6g"})

#path_cred_sets="gs://genetics_etl_python_playground/releases/24.03/credible_set/finngen_susie"
#local path
path_cred_sets="data/study_locus/"
sl=StudyLocus.from_parquet(session=session,path=path_cred_sets)
study_index=StudyIndex.from_parquet(session=session, path="data/index/")
# Find Overlaps
study_locus_overlap_intra = sl.find_overlaps(study_index, intra_study_overlap=True)
study_locus_overlap_inter = sl.find_overlaps(study_index, intra_study_overlap=False)
study_locus_combined = study_locus_overlap_intra.df.unionByName(study_locus_overlap_inter.df)
print(study_locus_combined)
print(type(study_locus_combined))
date = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
study_locus_combined.write.parquet(f"data/overlaps_{date}/")



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/06/19 13:06:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


                                                                                

24/06/19 13:06:29 WARN CacheManager: Asked to cache already cached data.
DataFrame[leftStudyLocusId: bigint, rightStudyLocusId: bigint, chromosome: string, tagVariantId: string, statistics: struct<left_logBF:double,left_posteriorProbability:double,left_beta:double,left_pValueMantissa:float,left_pValueExponent:int,right_logBF:double,right_posteriorProbability:double,right_beta:double,right_pValueMantissa:float,right_pValueExponent:int>]
<class 'pyspark.sql.dataframe.DataFrame'>


                                                                                

In [8]:
# Count the number of rows in the overlap DataFrame
print(study_locus_overlap_intra.df.show())
print(study_locus_overlap_intra.df.count())
print(study_locus_overlap_inter.df.show())
print(study_locus_overlap_inter.df.count())

+----------------+-----------------+----------+------------+----------+
|leftStudyLocusId|rightStudyLocusId|chromosome|tagVariantId|statistics|
+----------------+-----------------+----------+------------+----------+
+----------------+-----------------+----------+------------+----------+

None
0


                                                                                

+--------------------+--------------------+----------+----------------+--------------------+
|    leftStudyLocusId|   rightStudyLocusId|chromosome|    tagVariantId|          statistics|
+--------------------+--------------------+----------+----------------+--------------------+
|-3544786763432086627|-9215091430977920480|         1| 1_26607100_AC_A|{11.005551205553,...|
|-3544786763432086627|-9215091430977920480|         1|  1_26675220_A_G|{11.8288220892832...|
|-3544786763432086627|-9215091430977920480|         1|  1_26675402_G_A|{11.5844899925472...|
|-3544786763432086627|-9215091430977920480|         1|  1_26676156_T_C|{11.820897830622,...|
|-3544786763432086627|-9215091430977920480|         1|  1_26688257_C_G|{11.6264599364101...|
|-3544786763432086627|-9215091430977920480|         1|  1_26710663_T_G|{11.1957213194904...|
|-3544786763432086627|-9215091430977920480|         1|  1_26724732_A_G|{11.1957213194904...|
|-3544786763432086627|-9215091430977920480|         1|  1_26727525_A_G



4537734


                                                                                

#### Import StudyLocus Overlaps INTER AND INTRA for coloc

In [9]:
study_locus_overlaps = StudyLocusOverlap.from_parquet(session=session, path="data/overlaps_2024-06-19_13-06-30/")
# count the number of rows in the overlap DataFrame
print(study_locus_overlaps.df.count())

4537734


##### Example of output

In [132]:
print(study_locus_overlaps.df.show(50))
df = study_locus_overlaps.df
filtered_row = df.filter((df.leftStudyLocusId == -3544786763432086627) & (df.rightStudyLocusId == -3544786763432086627))
print(filtered_row.show())
# no overlaps for a single locus therefore no colocalisation found.

                                                                                

+--------------------+--------------------+----------+------------------+--------------------+
|    leftStudyLocusId|   rightStudyLocusId|chromosome|      tagVariantId|          statistics|
+--------------------+--------------------+----------+------------------+--------------------+
|-3544786763432086627|-9215091430977920480|         1|   1_26607100_AC_A|{11.005551205553,...|
|-3544786763432086627|-9215091430977920480|         1|    1_26675220_A_G|{11.8288220892832...|
|-3544786763432086627|-9215091430977920480|         1|    1_26675402_G_A|{11.5844899925472...|
|-3544786763432086627|-9215091430977920480|         1|    1_26676156_T_C|{11.820897830622,...|
|-3544786763432086627|-9215091430977920480|         1|    1_26688257_C_G|{11.6264599364101...|
|-3544786763432086627|-9215091430977920480|         1|    1_26710663_T_G|{11.1957213194904...|
|-3544786763432086627|-9215091430977920480|         1|    1_26724732_A_G|{11.1957213194904...|
|-3544786763432086627|-9215091430977920480|       



+----------------+-----------------+----------+------------+----------+
|leftStudyLocusId|rightStudyLocusId|chromosome|tagVariantId|statistics|
+----------------+-----------------+----------+------------+----------+
+----------------+-----------------+----------+------------+----------+

None


                                                                                

#### Colocalisation for SL overlap

In [10]:
from gentropy.method.colocalisation import Coloc
from gentropy.dataset.colocalisation import Colocalisation
import pyspark.sql.functions as F

# 1. Perform Colocalization
coloc_results = Coloc.colocalise(study_locus_overlaps)

print(coloc_results.df.show())

# 2. Analyze Colocalization Results
# Filter for significant colocalization (e.g., using a threshold for PP.H4)
significant_coloc = coloc_results.filter(F.col("h4") > 0.8)

# Print the results
print(significant_coloc.df.show())

# Print subset 7739811754326032609
filtered_row = coloc_results.df.filter((coloc_results.df.leftStudyLocusId == -7739811754326032609) | (coloc_results.df.rightStudyLocusId == -7739811754326032609))
print(filtered_row.show())


                                                                                

+----------+--------------------+--------------------+--------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|chromosome|    leftStudyLocusId|   rightStudyLocusId|numberColocalisingVariants|                  h0|                  h1|                  h2|                  h3|                  h4|          log2h4h3|colocalisationMethod|
+----------+--------------------+--------------------+--------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|         1|-7739811754326032609|-8580726767903703361|                         9|1.982276920469156...|1.341592281969666...|3.353873509699807...|  0.0217208566430716|  0.9779437558698183|5.4925985869166185|               COLOC|
|         1|-7180920194711895424|-8803538001521518159|                         2|1.445687016

                                                                                

+----------+--------------------+--------------------+--------------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+
|chromosome|    leftStudyLocusId|   rightStudyLocusId|numberColocalisingVariants|                  h0|                  h1|                  h2|                  h3|                h4|          log2h4h3|colocalisationMethod|
+----------+--------------------+--------------------+--------------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+
|         1|-7739811754326032609|-8580726767903703361|                         9|1.982276920469156...|1.341592281969666...|3.353873509699807...|  0.0217208566430716|0.9779437558698183|5.4925985869166185|               COLOC|
|         1|-7180920194711895424|-8803538001521518159|                         2|1.445687016280731..



+----------+--------------------+--------------------+--------------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+
|chromosome|    leftStudyLocusId|   rightStudyLocusId|numberColocalisingVariants|                  h0|                  h1|                  h2|                  h3|                h4|          log2h4h3|colocalisationMethod|
+----------+--------------------+--------------------+--------------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+
|         1|-6077882869059078660|-7739811754326032609|                         9|5.51113453516146E-11|2.079928035621036...|3.729900439702032E-9|0.013090117406130183|0.9867018860052956| 6.236064324417278|               COLOC|
|         1|-7739811754326032609|-8580726767903703361|                         9|1.982276920469156..

                                                                                

In [14]:
# Running coloc step function for debugging
from gentropy.colocalisation import ColocalisationStep
coloc = ColocalisationStep(session=session, credible_set_path=path_cred_sets, study_index_path=path_si, coloc_path = "output", colocalisation_method="Coloc")


24/06/19 13:20:43 WARN CacheManager: Asked to cache already cached data.
24/06/19 13:20:43 WARN CacheManager: Asked to cache already cached data.


                                                                                

In [15]:
print(coloc)
from gentropy.dataset.colocalisation import Colocalisation
output_path = "output/coloc/"
coloc_step_results = Colocalisation.from_parquet(session=session,path=output_path)
print(coloc_step_results.df.show())

# Print subset -7739811754326032609
filtered_row = coloc_step_results.df.filter((coloc_step_results.df.leftStudyLocusId == -7739811754326032609) | (coloc_step_results.df.rightStudyLocusId == -7739811754326032609))
print(filtered_row.show())
# Find any matching rows
for row in coloc_step_results.df.collect():
    if row.leftStudyLocusId == row.rightStudyLocusId:
        print(row)

<gentropy.colocalisation.ColocalisationStep object at 0x7f4b503fb5b0>
+--------------------+--------------------+----------+--------------------+--------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+----+
|    leftStudyLocusId|   rightStudyLocusId|chromosome|colocalisationMethod|numberColocalisingVariants|                  h0|                  h1|                  h2|                  h3|                  h4|          log2h4h3|clpp|
+--------------------+--------------------+----------+--------------------+--------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+----+
|-7739811754326032609|-8580726767903703361|         1|               COLOC|                         9|1.982276920469156...|1.341592281969666...|3.353873509699807...|  0.0217208566430716|  0.9779437558698183|5.4925985869166185|null|
|-

In [16]:
print(significant_coloc.df.count())



32429


                                                                                

In [53]:
# Show the schema of the colocalization results
schema = Colocalisation.get_schema()
#print(schema)
print(type(significant_coloc))
print(type(significant_coloc.df))


<class 'gentropy.dataset.colocalisation.Colocalisation'>
<class 'pyspark.sql.dataframe.DataFrame'>


In [17]:
# Convert pyspark dataframe to pandas dataframe

coloc_pd_df = significant_coloc.df.toPandas()


                                                                                

In [18]:
# Show the first 5 rows of the pandas dataframe
coloc_pd_df.head(5)

# Summary of df
#coloc_pd_df.describe()

Unnamed: 0,chromosome,leftStudyLocusId,rightStudyLocusId,numberColocalisingVariants,h0,h1,h2,h3,h4,log2h4h3,colocalisationMethod
0,1,-7739811754326032609,-8580726767903703361,9,1.982277e-12,1.341592e-10,0.0003353874,0.021721,0.977944,5.492599,COLOC
1,1,-7180920194711895424,-8803538001521518159,2,1.445687e-14,9.711416e-10,2.300574e-08,0.000546,0.999454,10.838132,COLOC
2,1,-7145299275585606737,-8780263499383943526,9,2.214284e-13,2.477526e-10,8.790068e-06,0.008844,0.991147,6.80827,COLOC
3,1,-6982916822936356932,-8404678411441676549,3,5.869043e-15,2.295631e-06,7.678875e-12,0.002006,0.997992,8.958893,COLOC
4,1,-6976441487811867207,-9003725253470911662,5,2.482528e-42,9.714654e-35,3.752589e-11,0.000469,0.999531,11.057644,COLOC


#### Filter left-joined df

Remove all rows with no EFO entries in column "traitFromSourceMappedIds"

In [None]:
from pyspark.sql.functions import col, size

# Filter to keep rows with non-empty traitFromSourceMappedIds
filtered_df = left_joined_df.filter((col("traitFromSourceMappedIds").isNotNull()) & (size(col("traitFromSourceMappedIds")) > 0))

# Show the first 30 rows of the filtered DataFrame
filtered_df.show(30)

filtered_df.select('traitFromSourceMappedIds').distinct().show(30)
filtered_df.select('traitFromSourceMappedIds').distinct().count()



+--------------------+--------------------+--------------------+----------+---------+--------------------+----------+------+--------------+--------------+-------------------------------+-------------+-------------------+---------------+-----------------+----------------+------------------+------------+-----------+----------+-----+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+
|             studyId|        studyLocusId|           variantId|chromosome| position|              region|      beta|zScore|pValueMantissa|pValueExponent|effectAlleleFrequencyFromSource|standardError|subStudyDescription|qualityControls|finemappingMetho

729