In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Local Spark session on all available cores ('local[*]') with 15g of driver memory.
# getOrCreate() reuses an already running session instead of starting a second one.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "15g")
    .appName('spark')
    .getOrCreate()
)

# Comparison between the 21.11/22.02 V2D data

In [34]:
# Read input files
#
# Every dataset sits under one local root, with one sub-folder per release
# (21.10 -> "*_nov" variables, 22.02 -> "*_feb" variables). Edit DATA_DIR to point
# the notebook at another copy of the data. The trailing comments record the
# bucket path each local copy corresponds to.
DATA_DIR = '/Users/irene/Documents/dev/datasets/genetics'

# Study tables (the 21.10 study index is JSON, the 22.02 one is parquet).
study_nov = spark.read.json(f'{DATA_DIR}/21.10/study-index') # from gs://genetics-portal-dev-data/21.10/outputs/lut/study-index
study_feb = spark.read.parquet(f'{DATA_DIR}/22.02/studies.parquet') # from gs://genetics-portal-dev-staging/v2d/220210/studies.parquet

# Top loci tables.
toploci_nov = spark.read.parquet(f'{DATA_DIR}/21.10/toploci.parquet') # from gs://genetics-portal-dev-data/21.10/inputs/v2d/toploci.parquet
toploci_feb = spark.read.parquet(f'{DATA_DIR}/22.02/toploci.parquet') # from gs://genetics-portal-dev-staging/v2d/220210/toploci.parquet

# L2G tables.
l2g_nov = spark.read.parquet(f'{DATA_DIR}/21.10/l2g') # from gs://genetics-portal-dev-data/21.10/outputs/l2g
l2g_feb = spark.read.parquet(f'{DATA_DIR}/22.02/l2g.full.220212.parquet') # from gs://genetics-portal-dev-staging/l2g/220212/predictions

# Tab-separated lookup with a header row; shared by both releases.
vep_consequences = spark.read.csv(f'{DATA_DIR}/21.10/vep_consequences.tsv', sep='\t', header=True) # common / from gs://genetics-portal-dev-data/21.10/inputs/lut/vep_consequences.tsv

## 1. Study table

-  ✅ GWASCat, and only GWASCat, have more studies (14102 increase)
-  ✅ 10 studies have been lost. 8 of them are due to changes in the reported trait in the GWAS Catalog. However, it seems that we should still be picking up `GCST010653` and `GCST010729`.
- **QUESTION**: What does it mean when the trait mentions `[conditional]`, for example in [GCST90016674](https://www.ebi.ac.uk/gwas/studies/GCST90016674)? Is it an underlying condition? **ANSWER**: those p-values are conditional on the lead variant, so that independent associations can be identified (see https://pubmed.ncbi.nlm.nih.gov/22426310).

In [4]:
def _count_studies(studies, *prefixes):
    """Count the distinct study IDs in a study table.

    Args:
        studies: DataFrame with a ``study_id`` column.
        *prefixes: Optional study ID prefixes. When given, only IDs that start
            with at least one of them are counted; with no prefix every ID counts.

    Returns:
        The number of distinct matching ``study_id`` values.
    """
    if prefixes:
        matches_prefix = F.col('study_id').startswith(prefixes[0])
        for prefix in prefixes[1:]:
            matches_prefix = matches_prefix | F.col('study_id').startswith(prefix)
        studies = studies.filter(matches_prefix)
    return studies.select('study_id').distinct().count()


# Each count triggers a Spark job, so every figure is computed exactly once and
# then reused for both the printed report and the consistency check below.
n_total_nov = _count_studies(study_nov)
n_total_feb = _count_studies(study_feb)
n_finngen_nov = _count_studies(study_nov, 'FINN')
n_finngen_feb = _count_studies(study_feb, 'FINN')
n_gwascat_nov = _count_studies(study_nov, 'GCST')
n_gwascat_feb = _count_studies(study_feb, 'GCST')
n_ukbiobank_nov = _count_studies(study_nov, 'NEALE', 'SAIGE')
n_ukbiobank_feb = _count_studies(study_feb, 'NEALE', 'SAIGE')

print('TOTAL Studies Nov:', n_total_nov)
print('TOTAL Studies Feb:', n_total_feb)
print('\n')
print('FINNGEN Studies Nov:', n_finngen_nov)
print('FINNGEN Studies Feb:*', n_finngen_feb)
print('\n')
print('GWASCAT Studies Nov:', n_gwascat_nov)
print('GWASCAT Studies Feb:', n_gwascat_feb)
print('\n')
print('UKBIOBANK Studies Nov:', n_ukbiobank_nov)
print('UKBIOBANK Studies Feb:', n_ukbiobank_feb)
print('\n *Finngen has an extra study (FINNGEN_R5_I9_HEARTFAIL_AND_CHD) that was missing because the mapping was not previously available.')

# The per-source Feb counts must add up to the Feb total; a mismatch means some
# study ID does not start with any of the prefixes counted above.
if n_finngen_feb + n_gwascat_feb + n_ukbiobank_feb != n_total_feb:
    raise AssertionError("Check study IDs. There's some mismatch.")



TOTAL Studies Nov: 36394
TOTAL Studies Feb: 50720


FINNGEN Studies Nov: 2924
FINNGEN Studies Feb:* 2925


GWASCAT Studies Nov: 30048
GWASCAT Studies Feb: 44373


UKBIOBANK Studies Nov: 3422
UKBIOBANK Studies Feb: 3422

 *Finngen has an extra study (FINNGEN_R5_I9_HEARTFAIL_AND_CHD) that was missing because the mapping was not previously available.


In [6]:
# Are we missing any study? Yes, 136. Most likely not problematic. --> UPDATE 14/02: only 10 are missing!
#
# Anti-join: keep the Nov study IDs that have no match among the Feb study IDs.
# At most 20 rows are brought back to the driver for display.
(
    study_nov.select('study_id').distinct()
    .join(study_feb.select('study_id').distinct(), on='study_id', how='left_anti')
    .limit(20)
    .collect()
)

[Row(study_id='GCST010653'),
 Row(study_id='GCST011066_2'),
 Row(study_id='GCST010729'),
 Row(study_id='GCST012085_2'),
 Row(study_id='GCST012086_3'),
 Row(study_id='GCST012083_2'),
 Row(study_id='GCST012084_3'),
 Row(study_id='GCST012173_6'),
 Row(study_id='GCST002299_2'),
 Row(study_id='GCST005362_2')]

## 2. Top loci table

- ✅  The number of lead variants has increased along with the number of studies (+12914).

- 🚨  229 lead variants have been lost. **TODO**: Check these cases, as they seem to be significant assocs. The average pValue exponent of the missing loci is ~11. Some examples:
  - Check `GCST90014290`:
    - `6_29954890_C_T` (rs28780071-T) is missing even though its p-value is significant. It is present at https://www.ebi.ac.uk/gwas/studies/GCST90014290
    - `2_16507491_G_T` (rs4240228) is in our top loci but is not in the Catalog. That is why we have 25 records for that study and they have 24.
  - Check `GCST90000050`:
    - We have 73 variants whereas GWASCat shows 88. https://www.ebi.ac.uk/gwas/studies/GCST90000050

- 🚨 `GCST90091061`, `GCST010653`, and `GCST010729` are not in the study table. All of them are available in GWASCat.

**High-level metrics**

TOTAL Top Loci Nov: 129507

TOTAL Top Loci Feb: 142421


TOTAL Studies in Top Loci Nov: 18330

TOTAL Studies in Top Loci Feb: 19777

In [7]:
def _to_lead_variants(toploci):
    """Reduce a top loci table to distinct (study, variant, p-value) rows.

    A ``variant`` identifier of the form ``chrom_pos_ref_alt`` is built from the
    four coordinate columns. If those columns are no longer present (the frame has
    already been through this function, e.g. because the cell was re-run), the
    existing ``variant`` column is used as is, so the cell can be executed repeatedly.

    Args:
        toploci: DataFrame with ``study_id``, ``pval_mantissa``, ``pval_exponent``
            and either ``chrom``/``pos``/``ref``/``alt`` or ``variant``.

    Returns:
        DataFrame with the distinct rows of ``study_id``, ``variant``,
        ``pval_mantissa`` and ``pval_exponent``.
    """
    coordinate_columns = ('chrom', 'pos', 'ref', 'alt')
    if all(column in toploci.columns for column in coordinate_columns):
        toploci = toploci.withColumn(
            'variant',
            F.concat(F.col('chrom'), F.lit('_'), F.col('pos'), F.lit('_'), F.col('ref'), F.lit('_'), F.col('alt')),
        )
    return toploci.select('study_id', 'variant', 'pval_mantissa', 'pval_exponent').distinct()


toploci_nov = _to_lead_variants(toploci_nov)
toploci_feb = _to_lead_variants(toploci_feb)

In [8]:
# Headline figures per release: distinct lead variants first, then distinct studies
# that have at least one row in the top loci table.
lead_variants_nov = toploci_nov.select('variant').distinct()
lead_variants_feb = toploci_feb.select('variant').distinct()
print('TOTAL Top Loci Nov:', lead_variants_nov.count())
print('TOTAL Top Loci Feb:', lead_variants_feb.count())
print('\n')

studies_with_loci_nov = toploci_nov.select('study_id').distinct()
studies_with_loci_feb = toploci_feb.select('study_id').distinct()
print('TOTAL Studies in Top Loci Nov:', studies_with_loci_nov.count())
print('TOTAL Studies in Top Loci Feb:', studies_with_loci_feb.count())

TOTAL Top Loci Nov: 129507
TOTAL Top Loci Feb: 142734


TOTAL Studies in Top Loci Nov: 18330
TOTAL Studies in Top Loci Feb: 19885


In [9]:
# Assert there are no studies in the toploci outside the study table.
# The anti-join keeps the top loci study IDs that have no match in the study table.
# It is collected once and reused for both the check and the error message,
# rather than running the same Spark job twice (count, then collect).
errors = (
    toploci_feb.select('study_id')
    .join(study_feb, on='study_id', how='left_anti')
    .distinct()
    .collect()
)
if errors:
    raise AssertionError(f'''
    There are studies in the top loci outside the study table,
    {errors}
    ''')

AssertionError: 
    There are studies in the top loci outside the study table,
    [Row(study_id='GCST010653'), Row(study_id='GCST010729')]
    

In [10]:
# Do the inverse check
#
# Assert there is no mismatch between the number of associations between study and top loci table:
# every study that reports num_assoc_loci > 0 must have at least one row in the top loci.
# The anti-join is collected once and reused for the check and the error message,
# rather than running the same Spark job twice (count, then collect).
errors = (
    study_feb.filter(F.col('num_assoc_loci') > 0)
    .join(toploci_feb, on='study_id', how='left_anti')
    .select('study_id')
    .distinct()
    .collect()
)
if errors:
    raise AssertionError(f'''
    There are studies that show association in the study table that are not found in the top loci,
    {errors}
    ''')

In [15]:
# Any lead variants that went missing? Yes, 229. Should be checked.

# Nov top loci rows whose variant does not appear anywhere in the Feb top loci.
# Built once and reused for both the listing and the average below.
missing_lead_variants = toploci_nov.join(toploci_feb.select('variant').distinct(), on='variant', how='left_anti')

# DataFrame.show() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
missing_lead_variants.distinct().show()

print('\n These seem to be significant associations. The average pValue exponent of the missing loci is ~11.')
missing_lead_variants.agg({'pval_exponent': 'avg'}).show()

+--------------------+--------------+-------------+-------------+
|             variant|      study_id|pval_mantissa|pval_exponent|
+--------------------+--------------+-------------+-------------+
|     13_27819203_G_A|  GCST90000047|          5.0|           -9|
|10_21709230_C_CTT...|  GCST90016668|          6.0|          -10|
|     10_9941388_C_CA|  GCST90000047|          3.0|           -9|
| 10_49105841_G_GTATA|  GCST90016667|          7.0|          -10|
|   1_153848973_A_ACC|  GCST90000047|          3.0|          -10|
|   6_126745342_G_GAA|  GCST90016666|          1.0|          -12|
|     11_78209884_A_G|  GCST90013405|          3.0|          -10|
|      6_31175734_G_A|  GCST90014288|          5.0|          -10|
|   3_47045779_ATTT_A|  GCST90016667|          2.0|           -8|
|  3_85601191_T_TACAC|  GCST90000047|          1.0|          -28|
| 1_50931892_AATAAT_A|  GCST90016675|          3.0|          -13|
|     6_124729847_C_G|  GCST90000047|          5.0|          -10|
|  10_2170

In [16]:
# 25 loci for GCST90014290
# Show up to 30 rows so that every lead variant of this study is listed.
is_target_study = F.col('study_id') == 'GCST90014290'
toploci_feb.where(is_target_study).show(n=30)


+------------+----------------+------------------+-------------+
|    study_id|         variant|     pval_mantissa|pval_exponent|
+------------+----------------+------------------+-------------+
|GCST90014290|  2_16507491_G_T|               2.5|          -14|
|GCST90014290|  3_47674055_T_C|             1.275|          -10|
|GCST90014290| 11_57344219_T_G|               3.7|          -14|
|GCST90014290| 3_150283437_A_G|1.3999999999999997|          -14|
|GCST90014290| 3_128617455_G_T|             5.293|          -11|
|GCST90014290|  6_18105845_T_G| 6.649000000000001|          -32|
|GCST90014290|12_106949598_T_C|              1.14|           -9|
|GCST90014290|  1_38991334_C_T|             4.352|          -12|
|GCST90014290| 21_37001879_G_T|              3.71|          -16|
|GCST90014290| 3_160499695_G_A| 6.280000000000001|          -13|
|GCST90014290|   1_8850051_A_G|             6.728|          -11|
|GCST90014290| 3_169141218_T_C|             4.246|           -9|
|GCST90014290|  6_1812079

In [22]:
# 73 loci for GCST90000050 and 88 on GWAScat
# Number of Feb top loci rows recorded for this study.
toploci_feb.where(F.col('study_id') == 'GCST90000050').count()

73

In [18]:
# Any studies in the top loci table went missing? Yes, 38. The examples don't look very relevant.

# Study IDs that had top loci in Nov but have none in Feb (anti-join on study_id).
# DataFrame.show() prints the table itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
(
    toploci_nov
    .join(toploci_feb.select('study_id').distinct(), on='study_id', how='left_anti')
    .select('study_id')
    .distinct()
    .show()
)

# Feb study table entries whose ID starts with GCST90016674:
# first 2 rows, untruncated, one field per line.
study_feb.filter(F.col('study_id').startswith('GCST90016674')).show(n=2, truncate=False, vertical=True)

+--------------+
|      study_id|
+--------------+
|GCST90016666_2|
|GCST90016674_2|
|GCST90016675_2|
|GCST90016667_2|
|  GCST012085_2|
|  GCST005362_2|
|  GCST012086_2|
|GCST90016673_2|
|  GCST002299_2|
|  GCST012173_6|
+--------------+

None
-RECORD 0-------------------------------------------------------------------------------------------------------
 study_id             | GCST90016674                                                                            
 pmid                 | PMID:34128465                                                                           
 pub_date             | 2021-06-15                                                                              
 pub_journal          | Elife                                                                                   
 pub_title            | Genetic architecture of 11 organ traits derived from abdominal MRI using deep learning. 
 pub_author           | Liu Y                                                 