In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Local Spark session: uses every available core and gives the driver 15 GB of memory.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "15g")
    .appName('spark')
    .getOrCreate()
)

# Comparison between the 21.11/22.02 V2D data

In [46]:
# Read input files
#
# Every input lives under one root directory, so the notebook can be pointed at
# another copy of the data by editing DATA_DIR only. The trailing comments record
# the bucket location noted for each file.
DATA_DIR = '/Users/irene/Documents/dev/datasets/genetics'
NOV_DIR = f'{DATA_DIR}/21.10'
FEB_DIR = f'{DATA_DIR}/22.02'

study_nov = spark.read.json(f'{NOV_DIR}/study-index')  # from gs://genetics-portal-dev-data/21.10/outputs/lut/study-index
study_feb = spark.read.parquet(f'{FEB_DIR}/studies.parquet')  # from gs://genetics-portal-dev-staging/v2d/220208/studies.parquet

toploci_nov = spark.read.parquet(f'{NOV_DIR}/toploci.parquet')  # from gs://genetics-portal-dev-data/21.10/inputs/v2d/toploci.parquet
toploci_feb = spark.read.parquet(f'{FEB_DIR}/toploci.parquet')  # from gs://genetics-portal-dev-staging/v2d/220208/toploci.parquet

l2g_nov = spark.read.parquet(f'{NOV_DIR}/l2g')  # from gs://genetics-portal-dev-data/21.10/outputs/l2g
l2g_feb = spark.read.parquet(f'{FEB_DIR}/l2g.full.220208.parquet')
# Tab-separated file with a header row; annotated as common to both releases.
vep_consequences = spark.read.csv(f'{NOV_DIR}/vep_consequences.tsv', sep='\t', header=True)  # common / from gs://genetics-portal-dev-data/21.10/inputs/lut/vep_consequences.tsv

## 1. Study table

-  ✅ GWASCat, and only GWASCat, have more studies (14102 increase)
-  ✅ 136 studies have been lost. After checking ~10 examples, this appears to come from the source: they are no longer in the GWAS Catalog.
- **QUESTION**. What does it mean when the trait mentions `[conditional]`? For example, in [GCST90016674](https://www.ebi.ac.uk/gwas/studies/GCST90016674): is it an underlying condition? **Answer**: those p-values are conditional on the lead variant, so that independent associations can be identified (https://pubmed.ncbi.nlm.nih.gov/22426310).

In [4]:
def count_studies(studies, *prefixes):
    """Return the number of distinct `study_id` values in `studies`.

    Args:
        studies: Spark DataFrame with a `study_id` column.
        *prefixes: optional ID prefixes; when given, only IDs starting with
            at least one of them are counted.

    Returns:
        int: the distinct-ID count.
    """
    if prefixes:
        matches_prefix = F.col('study_id').startswith(prefixes[0])
        for prefix in prefixes[1:]:
            matches_prefix = matches_prefix | F.col('study_id').startswith(prefix)
        studies = studies.filter(matches_prefix)
    return studies.select('study_id').distinct().count()


# The February counts are kept in variables because they are reused by the
# consistency check at the end of the cell.
feb_total = count_studies(study_feb)
feb_finngen = count_studies(study_feb, 'FINN')
feb_gwascat = count_studies(study_feb, 'GCST')
feb_ukbiobank = count_studies(study_feb, 'NEALE', 'SAIGE')

print('TOTAL Studies Nov:', count_studies(study_nov))
print('TOTAL Studies Feb:', feb_total)
print('\n')
print('FINNGEN Studies Nov:', count_studies(study_nov, 'FINN'))
print('FINNGEN Studies Feb:*', feb_finngen)
print('\n')
print('GWASCAT Studies Nov:', count_studies(study_nov, 'GCST'))
print('GWASCAT Studies Feb:', feb_gwascat)
print('\n')
print('UKBIOBANK Studies Nov:', count_studies(study_nov, 'NEALE', 'SAIGE'))
print('UKBIOBANK Studies Feb:', feb_ukbiobank)
print('\n *Finngen has an extra study (FINNGEN_R5_I9_HEARTFAIL_AND_CHD) that was missing because the mapping was not previously available.')

# The three per-source counts must add up to the total: a mismatch means some
# February study ID does not start with any of the expected prefixes.
if feb_finngen + feb_gwascat + feb_ukbiobank != feb_total:
    raise AssertionError("Check study IDs. There's some mismatch.")



TOTAL Studies Nov: 36394
TOTAL Studies Feb: 50543


FINNGEN Studies Nov: 2924
FINNGEN Studies Feb:* 2925


GWASCAT Studies Nov: 30048
GWASCAT Studies Feb: 44196


UKBIOBANK Studies Nov: 3422
UKBIOBANK Studies Feb: 3422

 *Finngen has an extra study (FINNGEN_R5_I9_HEARTFAIL_AND_CHD) that was missing because the mapping was not previously available.


In [6]:
# Are we missing any study? Yes, 136. Most likely not problematic.
# Lists up to 20 study IDs present in the November table but absent from the February one.

nov_study_ids = study_nov.select('study_id').distinct()
feb_study_ids = study_feb.select('study_id').distinct()
nov_study_ids.join(feb_study_ids, on='study_id', how='left_anti').limit(20).collect()

[Row(study_id='GCST90007218'),
 Row(study_id='GCST90007301'),
 Row(study_id='GCST90007229'),
 Row(study_id='GCST90007267'),
 Row(study_id='GCST90007273'),
 Row(study_id='GCST90007299'),
 Row(study_id='GCST90007248'),
 Row(study_id='GCST90007296'),
 Row(study_id='GCST90007238'),
 Row(study_id='GCST90007259'),
 Row(study_id='GCST90007294'),
 Row(study_id='GCST90007295'),
 Row(study_id='GCST90007205'),
 Row(study_id='GCST90007252'),
 Row(study_id='GCST90007183'),
 Row(study_id='GCST90007211'),
 Row(study_id='GCST90007224'),
 Row(study_id='GCST90007226'),
 Row(study_id='GCST90007241'),
 Row(study_id='GCST90007189')]

## 2. Top loci table

- ✅  The number of lead variants has increased along with the number of studies (12914).

- 🚨  229 lead variants have been lost. **TODO**: Check these cases, as they seem to be significant assocs. The average pValue exponent of the missing loci is ~11. Some examples:
  - Check `GCST90014290`:
    - `6_29954890_C_T` (rs28780071-T) is missing while the pValue is significant. It is present at https://www.ebi.ac.uk/gwas/studies/GCST90014290
    - `2_16507491_G_T` (rs4240228) is missing and it is not at the Catalog. That is why we have 25 records for that study and they have 24.
  - Check `GCST90000050`:
    - We have 73 variants whereas GWASCat show 88. https://www.ebi.ac.uk/gwas/studies/GCST90000050

- 🚨 `GCST90091061`, `GCST010653`, and `GCST010729` are not in the study table. All of them available at GWASCat.

**High-level metrics**

TOTAL Top Loci Nov: 129507

TOTAL Top Loci Feb: 142421


TOTAL Studies in Top Loci Nov: 18330

TOTAL Studies in Top Loci Feb: 19777

In [40]:
def with_variant_id(toploci):
    """Reduce a top loci table to distinct (study, variant, p-value) rows.

    The `variant` column is built as `chrom_pos_ref_alt`. It is only (re)built
    when the four source columns are present, so the cell can safely be re-run
    after `toploci_nov` / `toploci_feb` have already been reduced. Previously a
    second run failed because `chrom`, `pos`, `ref` and `alt` had been dropped
    by the `select`.

    Args:
        toploci: Spark DataFrame with `study_id`, `pval_mantissa` and
            `pval_exponent`, plus either `chrom`/`pos`/`ref`/`alt` or an
            already-built `variant` column.

    Returns:
        DataFrame with the distinct rows of `study_id`, `variant`,
        `pval_mantissa` and `pval_exponent`.
    """
    if {'chrom', 'pos', 'ref', 'alt'} <= set(toploci.columns):
        toploci = toploci.withColumn(
            'variant',
            F.concat(F.col('chrom'), F.lit('_'), F.col('pos'), F.lit('_'), F.col('ref'), F.lit('_'), F.col('alt')),
        )
    return toploci.select('study_id', 'variant', 'pval_mantissa', 'pval_exponent').distinct()


toploci_nov = with_variant_id(toploci_nov)
toploci_feb = with_variant_id(toploci_feb)

In [41]:
# High-level metrics: distinct lead variants and distinct studies in each release's top loci table.
n_variants_nov = toploci_nov.select('variant').dropDuplicates().count()
n_variants_feb = toploci_feb.select('variant').dropDuplicates().count()
n_studies_nov = toploci_nov.select('study_id').dropDuplicates().count()
n_studies_feb = toploci_feb.select('study_id').dropDuplicates().count()

print('TOTAL Top Loci Nov:', n_variants_nov)
print('TOTAL Top Loci Feb:', n_variants_feb)
print('\n')

print('TOTAL Studies in Top Loci Nov:', n_studies_nov)
print('TOTAL Studies in Top Loci Feb:', n_studies_feb)

TOTAL Top Loci Nov: 129507
TOTAL Top Loci Feb: 142734


TOTAL Studies in Top Loci Nov: 18330
TOTAL Studies in Top Loci Feb: 19885


In [42]:
# Assert there are no studies in the toploci outside the study table.
# The anti-join is collected once and reused for both the check and the error
# message, instead of being evaluated twice (count, then collect).
errors = (
    toploci_feb.select('study_id')
    .join(study_feb, on='study_id', how='left_anti')
    .distinct()
    .collect()
)
if errors:
    raise AssertionError (f'''
    There are studies in the top loci outside the study table,
    {errors}
    ''')

AssertionError: 
    There are studies in the top loci outside the study table,
    [Row(study_id='GCST90007218'), Row(study_id='GCST90007229'), Row(study_id='GCST90007267'), Row(study_id='GCST90007273'), Row(study_id='GCST90007299'), Row(study_id='GCST90007248'), Row(study_id='GCST90090992'), Row(study_id='GCST90093295_2'), Row(study_id='GCST90044700'), Row(study_id='GCST90086157'), Row(study_id='GCST90093299_2'), Row(study_id='GCST90007296'), Row(study_id='GCST90093356'), Row(study_id='GCST90026488'), Row(study_id='GCST90085906'), Row(study_id='GCST012637'), Row(study_id='GCST90007238'), Row(study_id='GCST90093323'), Row(study_id='GCST90007294'), Row(study_id='GCST90085927'), Row(study_id='GCST90093353'), Row(study_id='GCST90091246'), Row(study_id='GCST90093298'), Row(study_id='GCST90044896'), Row(study_id='GCST90093293'), Row(study_id='GCST90093295'), Row(study_id='GCST90007205'), Row(study_id='GCST90007252'), Row(study_id='GCST90093299'), Row(study_id='GCST90007183'), Row(study_id='GCST90007211'), Row(study_id='GCST90007224'), Row(study_id='GCST90027164'), Row(study_id='GCST90093348'), Row(study_id='GCST90007226'), Row(study_id='GCST90007241'), Row(study_id='GCST90093359'), Row(study_id='GCST90007189'), Row(study_id='GCST90007300'), Row(study_id='GCST90093321'), Row(study_id='GCST012676_4'), Row(study_id='GCST90007271'), Row(study_id='GCST90093351'), Row(study_id='GCST90093360'), Row(study_id='GCST90092525'), Row(study_id='GCST90093351_2'), Row(study_id='GCST90092518'), Row(study_id='GCST90026489'), Row(study_id='GCST90090980'), Row(study_id='GCST90093294_2'), Row(study_id='GCST90091207'), Row(study_id='GCST90093093'), Row(study_id='GCST90007206'), Row(study_id='GCST90007258'), Row(study_id='GCST90093300_2'), Row(study_id='GCST90007240'), Row(study_id='GCST90007245'), Row(study_id='GCST90093099'), Row(study_id='GCST90007257'), Row(study_id='GCST90007288'), Row(study_id='GCST90019040'), Row(study_id='GCST90091206'), Row(study_id='GCST90007277'), Row(study_id='GCST90093102'), Row(study_id='GCST90007246'), Row(study_id='GCST90007204'), 
Row(study_id='GCST90007261'), Row(study_id='GCST90007263'), Row(study_id='GCST90093296'), Row(study_id='GCST90007303'), Row(study_id='GCST90007247'), Row(study_id='GCST90007181'), Row(study_id='GCST90085915'), Row(study_id='GCST90007185'), Row(study_id='GCST90007239'), Row(study_id='GCST90007243'), Row(study_id='GCST90019017'), Row(study_id='GCST012638'), Row(study_id='GCST012616'), Row(study_id='GCST90007250'), Row(study_id='GCST90093115'), Row(study_id='GCST90086148'), Row(study_id='GCST90091828'), Row(study_id='GCST90017143'), Row(study_id='GCST90093297'), Row(study_id='GCST90093303_2'), Row(study_id='GCST90007244'), Row(study_id='GCST90007191'), Row(study_id='GCST90007195'), Row(study_id='GCST90093113'), Row(study_id='GCST90092523'), Row(study_id='GCST90093298_2'), Row(study_id='GCST90007203'), Row(study_id='GCST90090990'), Row(study_id='GCST010653'), Row(study_id='GCST90092520'), Row(study_id='GCST90093300'), Row(study_id='GCST90061528'), Row(study_id='GCST90091243'), Row(study_id='GCST90007253'), Row(study_id='GCST90007199'), Row(study_id='GCST90086149'), Row(study_id='GCST90020048'), Row(study_id='GCST90085917'), Row(study_id='GCST90091208'), Row(study_id='GCST90007265'), Row(study_id='GCST90093098'), Row(study_id='GCST90007233'), Row(study_id='GCST90007270'), Row(study_id='GCST90007249'), Row(study_id='GCST90092003'), Row(study_id='GCST90093287'), Row(study_id='GCST90093090'), Row(study_id='GCST90090991'), Row(study_id='GCST90093289'), Row(study_id='GCST90093301'), Row(study_id='GCST90007236'), Row(study_id='GCST90085909'), Row(study_id='GCST90085914'), Row(study_id='GCST90093297_2'), Row(study_id='GCST90093296_2'), Row(study_id='GCST90093294'), Row(study_id='GCST90007201'), Row(study_id='GCST90093349'), Row(study_id='GCST90026490'), Row(study_id='GCST90061526'), Row(study_id='GCST90007291'), Row(study_id='GCST90093121'), Row(study_id='GCST90085916'), Row(study_id='GCST90007292'), Row(study_id='GCST90027027'), Row(study_id='GCST90007187'), 
Row(study_id='GCST90093100'), Row(study_id='GCST90026494'), Row(study_id='GCST90007215'), Row(study_id='GCST90093324'), Row(study_id='GCST90093116'), Row(study_id='GCST90092521'), Row(study_id='GCST90093310_2'), Row(study_id='GCST90007182'), Row(study_id='GCST90007200'), Row(study_id='GCST90007264'), Row(study_id='GCST012656'), Row(study_id='GCST90007260'), Row(study_id='GCST90085911'), Row(study_id='GCST90007202'), Row(study_id='GCST90026491'), Row(study_id='GCST90093303'), Row(study_id='GCST90007269'), Row(study_id='GCST90007306'), Row(study_id='GCST90091825'), Row(study_id='GCST90026493'), Row(study_id='GCST90085924'), Row(study_id='GCST90086172'), Row(study_id='GCST90007305'), Row(study_id='GCST90007210'), Row(study_id='GCST90007234'), Row(study_id='GCST90090989'), Row(study_id='GCST90007231'), Row(study_id='GCST90007225'), Row(study_id='GCST90093310'), Row(study_id='GCST90007268'), Row(study_id='GCST90086172_2'), Row(study_id='GCST90093142'), Row(study_id='GCST90007186'), Row(study_id='GCST90007209'), Row(study_id='GCST90007208'), Row(study_id='GCST90026492'), Row(study_id='GCST90093317'), Row(study_id='GCST012676_7'), Row(study_id='GCST90026371'), Row(study_id='GCST90093316'), Row(study_id='GCST90093320'), Row(study_id='GCST90090974'), Row(study_id='GCST90007216'), Row(study_id='GCST90093354'), Row(study_id='GCST90007255'), Row(study_id='GCST90093097'), Row(study_id='GCST90007214'), Row(study_id='GCST90007237'), Row(study_id='GCST90007274'), Row(study_id='GCST90007266'), Row(study_id='GCST90007193'), Row(study_id='GCST90061529'), Row(study_id='GCST90002250'), Row(study_id='GCST010729'), Row(study_id='GCST90093322'), Row(study_id='GCST90093352'), Row(study_id='GCST90093101'), Row(study_id='GCST90091211'), Row(study_id='GCST90093302'), Row(study_id='GCST90091054'), Row(study_id='GCST90007275'), Row(study_id='GCST012619'), Row(study_id='GCST90007194'), Row(study_id='GCST90007254'), Row(study_id='GCST90093319')]
    

In [26]:
# Do the inverse check

# Assert that every study reporting associations (num_assoc_loci > 0) in the study
# table has at least one row in the top loci table. The anti-join is collected once
# and reused for both the check and the error message.
errors = (
    study_feb.filter(F.col('num_assoc_loci') > 0)
    .join(toploci_feb, on='study_id', how='left_anti')
    .select('study_id')
    .distinct()
    .collect()
)
if errors:
    raise AssertionError (f'''
    There are studies that show association in the study table that are not found in the top loci,
    {errors}
    ''')

In [27]:
# Any lead variants that went missing? Yes, 229. Should be checked.

# November top loci rows whose variant no longer appears anywhere in the February table.
missing_variants = toploci_nov.join(toploci_feb.select('variant').distinct(), on='variant', how='left_anti')
missing_variants.show()

print('\n These seem to be significant associations. The average pValue exponent of the missing loci is ~11.')
# `show()` prints the table itself and returns None, so it must not be wrapped in
# print(); doing so appended a stray "None" line to the output.
missing_variants.agg({'pval_exponent': 'avg'}).show()

+--------------------+--------------+-------------+-------------+
|             variant|      study_id|pval_mantissa|pval_exponent|
+--------------------+--------------+-------------+-------------+
|   11_30342653_CAA_C|  GCST90000050|          3.0|           -8|
|     5_155521079_G_A|  GCST90000047|          3.0|          -13|
|      3_25594252_T_C|  GCST90000047|          3.0|           -8|
|      6_29954890_C_T|  GCST90014290|          4.0|           -9|
|   14_100840504_TC_T|  GCST90016669|          1.0|          -12|
|    4_158755103_C_CA|  GCST90000050|          1.0|           -8|
|     14_32829617_T_A|  GCST90000047|          2.0|           -8|
|5_56506300_TCAAGG...|  GCST90016671|          4.0|          -11|
|    3_160121123_GA_G|  GCST90016667|          3.0|           -8|
|     21_39269251_A_T|  GCST90000047|          3.0|           -8|
|      2_16521131_G_C|  GCST90014290|          1.0|          -12|
|    11_46599484_CA_C|  GCST90000047|          2.0|          -10|
|     8_14

In [77]:
# 25 loci for GCST90014290 (up to 30 rows are displayed)
gcst90014290_loci = toploci_feb.where(F.col('study_id') == 'GCST90014290')
gcst90014290_loci.show(n=30)


+------------+----------------+------------------+-------------+
|    study_id|         variant|     pval_mantissa|pval_exponent|
+------------+----------------+------------------+-------------+
|GCST90014290|  2_16507491_G_T|               2.5|          -14|
|GCST90014290|  3_47674055_T_C|             1.275|          -10|
|GCST90014290| 11_57344219_T_G|               3.7|          -14|
|GCST90014290| 3_150283437_A_G|1.3999999999999997|          -14|
|GCST90014290| 3_128617455_G_T|             5.293|          -11|
|GCST90014290|  6_18105845_T_G| 6.649000000000001|          -32|
|GCST90014290|12_106949598_T_C|              1.14|           -9|
|GCST90014290|  1_38991334_C_T|             4.352|          -12|
|GCST90014290| 21_37001879_G_T|              3.71|          -16|
|GCST90014290| 3_160499695_G_A| 6.280000000000001|          -13|
|GCST90014290|   1_8850051_A_G|             6.728|          -11|
|GCST90014290| 3_169141218_T_C|             4.246|           -9|
|GCST90014290|  6_1812079

In [28]:
# 73 loci for GCST90000050
gcst90000050_loci = toploci_feb.where(F.col('study_id') == 'GCST90000050')
gcst90000050_loci.count()

73

In [32]:
# Any studies in the top loci table went missing? Yes, 38. The examples don't look very relevant.

# `show()` prints the table itself and returns None, so it is called directly;
# wrapping it in print() emitted an extra "None" line after each table.
missing_studies = (
    toploci_nov
    .join(toploci_feb.select('study_id').distinct(), on='study_id', how='left_anti')
    .select('study_id')
    .distinct()
)
missing_studies.show()

# Study table records whose ID starts with GCST90016674: at most 2 rows, untruncated, in vertical layout.
study_feb.filter(F.col('study_id').startswith('GCST90016674')).show(n=2, truncate=False, vertical=True)

+--------------+
|      study_id|
+--------------+
|  GCST002295_4|
|    GCST001965|
|GCST90016666_2|
|  GCST010178_2|
|  GCST001860_2|
|    GCST001445|
|GCST90016674_2|
|    GCST004956|
|  GCST002295_2|
|  GCST001892_2|
|  GCST012308_2|
|  GCST001033_3|
|GCST90016675_2|
|    GCST003427|
|    GCST012013|
|    GCST005716|
|GCST90016667_2|
|    GCST001919|
|  GCST012085_2|
|    GCST000433|
+--------------+
only showing top 20 rows

None
-RECORD 0-------------------------------------------------------------------------------------------------------
 ancestry_initial     | [European=32858]                                                                        
 ancestry_replication | []                                                                                      
 has_sumstats         | true                                                                                    
 n_cases              | null                                                                                 