In [1]:
import pandas as pd

from pyspark.sql import DataFrame, SparkSession
import pyspark.sql.functions as F

spark = (SparkSession.builder
         .master('local[*]')
         .config("spark.driver.memory", "15g")
         .appName('spark')
         .getOrCreate())

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/10 12:54:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Data

List of HGVS identifier -> list of target genes provided by VEP
- 6731 variants. 608 with more than 5 genes. 1190 with more than 3 genes.
  - If we go for the cutoff of 3 I guess extracting their impact is no longer of interest.
- 3770 variants with uncertain coordinates. Most of the cases (3510) with unknown bounds.
  - We want to keep them but taking a conservative approach.
  - Problem in indexing these. We will have multiple ids that actually refer to the same phenomenom.
- 177 variants are mapped to a target different than Ensembl (LRG).

### Questions
1. Do we want to establish a threshold for a maximum nmber of targets to be used? Yes. Most likely a cutoff of 3. We will be dropping the more promiscuous cases.
2. How many of all the complex variants were accessible by VEP?



In [None]:
import json

with open('data/vep-annotated-complex.json') as f:
    data = json.load(f)

data.keys()

In [15]:
var = pd.DataFrame({
    "variantId": list(data.keys()),
    "targetIds": list(data.values())
  })
# var.head()



In [16]:
var_df = spark.createDataFrame(var)
var_df.show(5)

+--------------------+--------------------+
|           variantId|           targetIds|
+--------------------+--------------------+
|NC_000001.11:g.20...|   [ENSG00000075151]|
|NC_000006.12:g.10...|[ENSG00000230314,...|
|NC_000002.12:g.23...|[ENSG00000163295,...|
|NC_000019.10:g.14...|[ENSG00000115257,...|
|NC_000006.12:g.52...|   [ENSG00000244067]|
+--------------------+--------------------+
only showing top 5 rows



In [19]:
var_df = var_df.withColumn('s', F.size(F.col('targetIds')))

In [28]:
var_df.orderBy(F.col('s').desc()).show(5)

+--------------------+--------------------+---+
|           variantId|           targetIds|  s|
+--------------------+--------------------+---+
|NC_000013.11:g.46...|[ENSG00000232954,...|437|
|NC_000015.10:g.22...|[ENSG00000259905,...|251|
|NC_000005.10:g.(5...|[ENSG00000249238,...|232|
|NC_000017.11:g.(?...|[ENSG00000205266,...|230|
|NC_000021.9:g.(10...|[ENSG00000232193,...|220|
+--------------------+--------------------+---+
only showing top 5 rows



In [30]:
var_df_cutoff = var_df.filter(F.col('s') < 3)
var_df_cutoff.show(5, truncate=False)



+--------------------------------------------------+----------------------------------+---+
|variantId                                         |targetIds                         |s  |
+--------------------------------------------------+----------------------------------+---+
|NC_000001.11:g.20810811_20829210del               |[ENSG00000075151]                 |1  |
|NC_000006.12:g.52751643_52757905del               |[ENSG00000244067]                 |1  |
|NC_000023.11:g.124061012_124094911del             |[ENSG00000101972]                 |1  |
|NC_000022.11:g.(?_41092585)_(41093099_41117186)del|[ENSG00000100393, ENSG00000284015]|2  |
|NC_000019.10:g.(?_7112255)_(7249328_?)del         |[ENSG00000171105]                 |1  |
+--------------------------------------------------+----------------------------------+---+
only showing top 5 rows



In [34]:
var_df_cutoff.filter(F.col('variantId').contains('?')).count()

3510

In [38]:
var_df.withColumn('targetId', F.explode(F.col('targetIds'))).filter(~F.col('targetId').contains('ENS')).select('variantId', 'targetIds').distinct().show(5)

+--------------------+----------+
|           variantId| targetIds|
+--------------------+----------+
|LRG_292:g.96347_1...| [LRG_292]|
|LRG_1121:g.69063_...|[LRG_1121]|
|LRG_487:g.(19968_...| [LRG_487]|
|LRG_292:g.142345_...| [LRG_292]|
|LRG_292:g.139830_...| [LRG_292]|
+--------------------+----------+
only showing top 5 rows

