In [3]:
import pandas as pd

from pyspark.sql import DataFrame, SparkSession
import pyspark.sql.functions as F

# Local Spark session: `local[*]` master, with the driver memory raised to 15g.
session_builder = SparkSession.builder.master('local[*]')
session_builder = session_builder.config("spark.driver.memory", "15g").appName('spark')
spark = session_builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/12 12:03:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/04/12 12:03:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/04/12 12:03:24 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/04/12 12:03:24 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


## Preliminary data

List of HGVS identifier -> list of target genes provided by VEP
- 6731 variants. 608 with more than 5 genes. 1190 with more than 3 genes.
  - If we go for the cutoff of 3 I guess extracting their impact is no longer of interest.
- 3770 variants with uncertain coordinates. Most of the cases (3510) with unknown bounds.
  - We want to keep them but taking a conservative approach.
  - Problem in indexing these: we will have multiple IDs that actually refer to the same phenomenon.
- 177 variants are mapped to a target different than Ensembl (LRG).

### Questions
1. Do we want to establish a threshold for the maximum number of targets to be used? Yes. Most likely a cutoff of 3. We will be dropping the more promiscuous cases.
2. How many of all the complex variants were accessible by VEP?

April's notebook to generate this data is available here: http://nbviewer.org/github/EBIvariation/eva-opentargets/blob/master/data-exploration/complex-events/notebooks/hgvs-follow-up.ipynb#Target-genes-from-HGVS-spans



In [4]:
import json

# The VEP annotation is a JSON object: HGVS identifier -> list of target gene IDs.
with open('data/vep-annotated-complex.json') as f:
    data = json.load(f)

# One row per variant; the list of targets is kept as a single array-valued cell.
var = pd.DataFrame(list(data.items()), columns=["variantId", "targetIds"])

var_df = spark.createDataFrame(var)
var_df.show(n=5)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+--------------------+
|           variantId|           targetIds|
+--------------------+--------------------+
|NC_000001.11:g.20...|   [ENSG00000075151]|
|NC_000006.12:g.10...|[ENSG00000230314,...|
|NC_000002.12:g.23...|[ENSG00000163295,...|
|NC_000019.10:g.14...|[ENSG00000115257,...|
|NC_000006.12:g.52...|   [ENSG00000244067]|
+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [5]:
# `s` = number of target genes listed for the variant (length of the `targetIds` array).
var_df = var_df.withColumn('s', F.size('targetIds'))

In [8]:
# Variants with the largest number of targets first.
var_df.sort(F.desc('s')).show(n=5)

+--------------------+--------------------+---+
|           variantId|           targetIds|  s|
+--------------------+--------------------+---+
|NC_000013.11:g.46...|[ENSG00000232954,...|437|
|NC_000015.10:g.22...|[ENSG00000259905,...|251|
|NC_000005.10:g.(5...|[ENSG00000249238,...|232|
|NC_000017.11:g.(?...|[ENSG00000205266,...|230|
|NC_000021.9:g.(10...|[ENSG00000232193,...|220|
+--------------------+--------------------+---+
only showing top 5 rows



In [10]:
# Maximum number of target genes a variant may have to be kept.
# The preliminary notes size the promiscuous set as variants with *more than* 3 genes
# and talk about a "maximum number of targets" of 3, so a variant with exactly 3 targets
# must be retained. The previous `s < 3` comparison dropped those as well (off by one).
MAX_TARGETS = 3

var_df_cutoff = var_df.filter(F.col('s') <= MAX_TARGETS)
var_df_cutoff.show(5, truncate=False)



+--------------------------------------------------+----------------------------------+---+
|variantId                                         |targetIds                         |s  |
+--------------------------------------------------+----------------------------------+---+
|NC_000001.11:g.20810811_20829210del               |[ENSG00000075151]                 |1  |
|NC_000006.12:g.52751643_52757905del               |[ENSG00000244067]                 |1  |
|NC_000023.11:g.124061012_124094911del             |[ENSG00000101972]                 |1  |
|NC_000022.11:g.(?_41092585)_(41093099_41117186)del|[ENSG00000100393, ENSG00000284015]|2  |
|NC_000019.10:g.(?_7112255)_(7249328_?)del         |[ENSG00000171105]                 |1  |
+--------------------------------------------------+----------------------------------+---+
only showing top 5 rows



In [25]:
# "Accurate" variants: HGVS expressions containing neither a '?' nor a '(' —
# presumably the markers of unknown bounds / uncertain ranges described in the notes.
has_question_mark = F.col('variantId').contains('?')
has_parenthesis = F.col('variantId').contains('(')
accurate = var_df_cutoff.where(~(has_question_mark | has_parenthesis))

# Reference-sequence prefix: everything before the first underscore of the identifier.
(accurate
 .select(F.split('variantId', '_').getItem(0).alias('first'))
 .distinct()
 .show(3))

+-----+
|first|
+-----+
|   NC|
|  LRG|
|   NW|
+-----+



In [26]:
# Print the first 20 identifiers in full (no column truncation).
accurate.select(F.col('variantId')).show(n=20, truncate=False)

+-------------------------------------+
|variantId                            |
+-------------------------------------+
|NC_000001.11:g.20810811_20829210del  |
|NC_000006.12:g.52751643_52757905del  |
|NC_000023.11:g.124061012_124094911del|
|NC_000015.10:g.27878517_28001142del  |
|NC_000004.12:g.93013415_93157863del  |
|NC_000004.12:g.93098691_93135614del  |
|NC_000004.12:g.92559959_92610106del  |
|NC_000001.11:g.15457276_15470313del  |
|NC_000001.11:g.35902179_35922053del  |
|NC_000001.11:g.111383531_111401451del|
|NC_000001.11:g.191864689_191918605del|
|NC_000001.11:g.196854170_196893439del|
|NC_000002.12:g.35245661_35372253del  |
|NC_000002.12:g.51118336_51183988del  |
|NC_000002.12:g.153845295_154275106dup|
|NC_000003.12:g.89335416_89368021del  |
|NC_000003.12:g.96585044_96601120del  |
|NC_000003.12:g.136430297_136541880dup|
|NC_000003.12:g.176175294_176189554del|
|NC_000004.12:g.42695536_42706081del  |
+-------------------------------------+
only showing top 20 rows



In [38]:
# Variants having at least one target whose ID does not contain 'ENS'
# (the sample output shows LRG identifiers).
non_ensembl = (
    var_df
    .withColumn('targetId', F.explode('targetIds'))
    .where(~F.col('targetId').contains('ENS'))
    .select('variantId', 'targetIds')
    .distinct()
)
non_ensembl.show(5)

+--------------------+----------+
|           variantId| targetIds|
+--------------------+----------+
|LRG_292:g.96347_1...| [LRG_292]|
|LRG_1121:g.69063_...|[LRG_1121]|
|LRG_487:g.(19968_...| [LRG_487]|
|LRG_292:g.142345_...| [LRG_292]|
|LRG_292:g.139830_...| [LRG_292]|
+--------------------+----------+
only showing top 5 rows



## Data from EVA's 22.04 submission

Conclusion: incremental increase in all the metrics I've checked without any noticeable data loss.
Interestingly, HGVS is present in almost all of the evidence even for those with an imprecise location.

In [6]:
# Full evidence file, then one subset per `datasourceId` value of interest.
submission = spark.read.json('data/cttv012-2022-04-07.json.gz')

datasource = F.col('datasourceId')
eva_df = submission.where(datasource == 'eva')
somatic_df = submission.where(datasource == 'eva_somatic')

                                                                                

In [11]:
# 30k increase in evidence / 100 for somatic evidence
# The trailing figures are the author's comparison counts (presumably the previous release).
for evidence_df in (
    submission,
    eva_df,      # 1140083
    somatic_df,  # 13447
):
    print(evidence_df.count())

                                                                                

1186147


                                                                                

1172590


[Stage 26:>                                                         (0 + 1) / 1]

13557


                                                                                

In [9]:
# Practically 100% of the evidence has an HGVS ID - this is better coverage than variant ID (1186126 vs 1184631)
has_hgvs = F.col('variantHgvsId').isNotNull()
submission.where(has_hgvs).count()

                                                                                

1186126

In [15]:
# Evidence carrying at least one of the two variant identifiers.
has_hgvs_id = F.col('variantHgvsId').isNotNull()
has_variant_id = F.col('variantId').isNotNull()
submission.where(has_hgvs_id | has_variant_id).count()

                                                                                

1186135

In [14]:
# Inspect evidence without an HGVS ID: untruncated values, one field per line.
submission.where(F.col('variantHgvsId').isNull()).show(n=5, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [17]:
# Number of distinct entries across all `literature` arrays.
(submission
 .select(F.explode('literature').alias('lit'))
 .distinct()
 .count())

                                                                                

24175

In [20]:
# Number of distinct mapped disease identifiers.
# NOTE(review): 'Mappedid' is cased differently from the other camelCase columns used in
# this notebook; it presumably resolves only because Spark's column matching is
# case-insensitive by default — confirm against the schema.
mapped_diseases = submission.select('diseaseFromSourceMappedid').distinct()
mapped_diseases.count()

                                                                                

5194

In [23]:
# Distinct HGVS identifiers that contain a '?', printed in full.
(submission
 .where(F.col('variantHgvsId').contains('?'))
 .select('variantHgvsId')
 .distinct()
 .show(n=100, truncate=False))

[Stage 64:>                                                         (0 + 1) / 1]

+-------------------------------------------------------------+
|variantHgvsId                                                |
+-------------------------------------------------------------+
|NC_000009.12:g.27573529_27573534GGCCCC[(24_?)]               |
|NM_002973.3:c.496_498CAG([33_?])                             |
|NM_002024.6:c.-128GGM[55_?]                                  |
|NG_011716.2:g.54765TGC[(51_?)]                               |
|NC_000008.11:g.104588972GCC[(90_?)]                          |
|NM_002111.8:c.52CAR[(?_26)]                                  |
|NC_000009.12:g.(27573529_27573546)ins(360_?)                 |
|NG_009797.1:g.6396CWG[(41_?)]                                |
|NC_000005.10:g.10356348_10356407TTTTA[(9_?)]TTTCA[(791_1035)]|
|NM_005716.4:c.-211GGC[(73_?)]                                |
|NC_000009.12:g.130681606CGC[(15_?)]                          |
|NC_000019.10:g.45770205CAG[(51_?)]                           |
|NC_000023.11:g.147912052GGC[(200_?)]   

                                                                                