In [2]:
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

spark = (SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "15g")
    .appName('spark')
    .getOrCreate())

In [3]:
tim = (spark.read.csv('data/DAILYMED_DISEASE_ChEMBL_ML_2.tsv', sep='\t', header=True)
    .select('chembl_id', 'efo_id', 'ref_url', 'method_by')
    .withColumn('efo_id', F.regexp_replace(F.col('efo_id'), ':', '_'))
    .withColumn('dailymed_id', F.element_at(F.split(F.col('ref_url'), '='), -1)).drop('ref_url').distinct()

    .filter(F.col('method_by') == 'ml').drop('method_by')
    )
tim.first()

Row(chembl_id='CHEMBL1751', efo_id='EFO_0003144', dailymed_id='58678fd6-afe3-4a4c-a37f-356444194cd6')

In [4]:
print(tim.distinct().count())
print(tim.select('dailymed_id').distinct().count())
tim.select('chembl_id', 'efo_id').distinct().count()

8469
3946


2832

In [5]:
'''
tim_expanded = (

    tim.join(disease_idx, disease_idx['id'] == tim['efo_id'], how='left')
        .withColumn('efo_id', F.coalesce('ids', F.array(F.col('efo_id'))))
        .withColumn('efo_id', F.explode('efo_id'))
        .distinct()
        .persist()

)
'''
;

''

In [6]:
# Current manual indications

ind = spark.read.parquet('data/indication')

In [7]:
# Extract dailymed refs

dailymed = (
    ind.withColumn('indication', F.explode('indications'))
        .withColumn('reference', F.explode('indication.references'))
        .filter(F.col('reference.source') == 'DailyMed')
        .withColumn('efo_id', F.col('indication.disease'))
        .withColumn('dailymed_id', F.explode('reference.ids'))
        .drop('approvedIndications', 'indications', 'references', 'reference', 'indication', 'indicationCount')
        .distinct()
)

dailymed.first()

Row(id='CHEMBL480', efo_id='EFO_0003948', dailymed_id='da982c79-4f29-467c-b237-1f13c2051c59')

In [8]:
print(dailymed.distinct().count())
print(dailymed.select('dailymed_id').distinct().count())
dailymed.select('id', 'efo_id').distinct().count()

24720
9510


2552

In [9]:
## Compare datasets

In [10]:
print(dailymed.join(tim, on='efo_id', how='left_anti').distinct().count())
print(tim.join(dailymed, on='efo_id', how='left_anti').select('efo_id').distinct().count())
# 2504 EFOs are lost
# 337 are gained

2504
337


In [11]:
print(dailymed.join(tim, dailymed['id'] == tim['chembl_id'], how='left_anti').select('id').distinct().count())
tim.join(dailymed, dailymed['id'] == tim['chembl_id'], how='left_anti').select('chembl_id').distinct().count()

# 516 chembl ids are lost
# 548 are gained

516


548

In [12]:
dailymed.join(tim, dailymed['id'] == tim['chembl_id'], how='left_anti').select('id', 'dailymed_id').first()

Row(id='CHEMBL480', dailymed_id='da982c79-4f29-467c-b237-1f13c2051c59')

In [13]:
dailymed.join(tim, on='dailymed_id', how='left_anti').select('dailymed_id').distinct().show(truncate=False)

+------------------------------------+
|dailymed_id                         |
+------------------------------------+
|52321fe5-c001-4e2d-8149-3197ddc13764|
|9b600839-6c1b-4000-a9c5-efd74586a68a|
|c0ec7e75-03b9-4b5b-ca8b-58cb0f65b5d9|
|a84712bf-221d-4366-9cf0-b8c19428c276|
|c18073be-d52a-4347-a8d1-6be11bcb7824|
|a3ae5cba-f495-4593-b8fc-5ada34790745|
|7b2638f5-8975-4259-be52-0da5c263bcc0|
|c8c7e092-0eb4-4945-934d-0b4cf664ad23|
|ef9044c1-eed3-4239-8f8b-64affd7a9ba0|
|de965605-268a-4479-86f7-84de949cf36f|
|b1397a52-d99f-44f0-8649-8ceb8e7b9b85|
|01484f7c-24c9-4c0a-8ae5-c483cb7e6aaa|
|f0c660da-4198-4d04-87e9-5d6a7b6f75cf|
|b03d1cc5-6142-474b-9670-a364393d4142|
|0ffab497-1988-426b-8c9e-3be653a203dd|
|854f3663-35e4-4aef-9dfa-7e023b70b8fd|
|8f8f4ece-de78-4c13-8d9f-c3bb7053019d|
|17561aae-f0bc-4ffc-a007-985b17e9baf0|
|06fbdb63-d32f-4212-9962-f1de15597dda|
|91d45442-4232-4111-9dd8-0d787085af6d|
+------------------------------------+
only showing top 20 rows



In [14]:
print(dailymed.join(tim, on='dailymed_id', how='left_anti').select('dailymed_id').distinct().count())
tim.join(dailymed, on='dailymed_id', how='left_anti').select('dailymed_id').distinct().count()

# 8710 references are lost
# 3146 are gained

8710


3146

In [15]:
print(dailymed.withColumnRenamed('id', 'chembl_id').join(tim, ['chembl_id', 'efo_id'], how='left_anti').select('chembl_id', 'efo_id').distinct().count())
tim.join(dailymed.withColumnRenamed('id', 'chembl_id'), ['chembl_id', 'efo_id'], how='left_anti').select('chembl_id', 'efo_id').distinct().count()

# 1848 drug/indication pairs are lost
# 2128 are gained

1848


2128

In [16]:
print(dailymed.select('id').distinct().count())
print(tim.select('chembl_id').distinct().count())

1170
1202


In [17]:
'''
# are the lost references the source of the lost chembl ids? no. There are only 42 diff chembl ids in lost_refs.

lost_refs = dailymed.join(tim, on='dailymed_id', how='left_anti').select('id').distinct()

lost_refs.select('id').distinct().join(tim.select('chembl_id').distinct(), tim['chembl_id'] == lost_refs['id'], how='left_anti').distinct().count()
'''
;

''

In [18]:
# Taking the manual curation as the baseline, I want to see if the annotation is related

efo_ancestors = (
    spark.read.parquet('/Users/irene/Documents/dev/pyspark/21.09.5/diseases')
        .select(F.array(F.col('id')).alias('id'), 'ancestors')
        .withColumn('expanded_efo_id', F.array_union(F.col('id'), F.col('ancestors')))
        .withColumn('efo_id', F.explode('id'))
        .select('efo_id', 'expanded_efo_id')
)

tim_w_ancestors = tim.join(efo_ancestors, on='efo_id', how='left')
tim_w_ancestors.show(5)

+-----------+-------------+--------------------+--------------------+
|     efo_id|    chembl_id|         dailymed_id|     expanded_efo_id|
+-----------+-------------+--------------------+--------------------+
|EFO_0003144|   CHEMBL1751|58678fd6-afe3-4a4...|[EFO_0003144, MON...|
|EFO_0006505|CHEMBL1200735|7d74dfa6-0468-43a...|[EFO_0006505, MON...|
|EFO_0000676|    CHEMBL424|af7060a9-986c-500...|[EFO_0000676, OTA...|
| HP_0002148|CHEMBL1200925|475f5cd7-45bd-412...|[HP_0002148, EFO_...|
| HP_0000989|CHEMBL1201089|6f68cbbb-6269-4bb...|[HP_0000989, HP_0...|
+-----------+-------------+--------------------+--------------------+
only showing top 5 rows



In [19]:
# Grouping by drug and reference. Is the mapping related? -->
# --> Is the curated mapping present in any of the ancestors in Tim's annotation

enrichment = (
    dailymed.withColumnRenamed('id', 'chembl_id')
        .join(
        tim_w_ancestors.withColumnRenamed('efo_id', 'new_efo_id'),
        on = ['dailymed_id', 'chembl_id'],
        how='left')
        .distinct()
        .withColumn('sameMapping', F.when(F.col('efo_id') == F.col('new_efo_id'), True).otherwise(False))
        .withColumn('isRelated', F.array_contains(F.col('expanded_efo_id'), F.col('efo_id')))
        .withColumn('isRelated', F.when(F.col('isRelated').isNull(), False).otherwise(F.col('isRelated')))
)

enrichment.show()

+--------------------+-------------+-----------+-----------+--------------------+-----------+---------+
|         dailymed_id|    chembl_id|     efo_id| new_efo_id|     expanded_efo_id|sameMapping|isRelated|
+--------------------+-------------+-----------+-----------+--------------------+-----------+---------+
|e6d0a19a-8d4e-4c5...|   CHEMBL1464|EFO_0000275|       null|                null|      false|    false|
|bf1dd603-d91e-03d...|    CHEMBL911|EFO_0004698|       null|                null|      false|    false|
|d58e820b-bfd2-4e6...|   CHEMBL1643|EFO_0004220|       null|                null|      false|    false|
|d62d754e-2310-491...|CHEMBL1201863| HP_0100633|       null|                null|      false|    false|
|000dc81d-ab91-450...|   CHEMBL1069|EFO_0003144|EFO_0003144|[EFO_0003144, MON...|       true|     true|
|2ebee624-71ee-4eb...|    CHEMBL435|EFO_0000537|       null|                null|      false|    false|
|3302a75c-94d0-4e3...|    CHEMBL435|EFO_0000537|       null|    

In [20]:
enrichment.select('dailymed_id', 'chembl_id', 'efo_id', 'isRelated').distinct().groupBy('isRelated').count().show()

+---------+-----+
|isRelated|count|
+---------+-----+
|     true|  766|
|    false|24358|
+---------+-----+



In [21]:
# It is not so trivial to check the enrichment of only the current indications.
# The explosion is so big that the counts are not informative enough
# I'll break it into parts

sameMapping = (
    dailymed.withColumnRenamed('id', 'chembl_id')
        .join(
        tim_w_ancestors,
        on = ['dailymed_id', 'chembl_id', 'efo_id'],
        how='inner')
        .distinct()
)

diffMapping = (
    dailymed.withColumnRenamed('id', 'chembl_id')
        .join(
        tim_w_ancestors,
        on = ['dailymed_id', 'chembl_id', 'efo_id'],
        how='left_anti')
        .distinct()
)

print('Annotation with different mapping', diffMapping.count())
print('Annotation with same mapping', sameMapping.count())

Annotation with different mapping 24031
Annotation with same mapping 689


In [22]:
# A record equals to a reference/drug/indication relationship
dailymed.select('dailymed_id', F.col('id').alias('chembl_id'), 'efo_id').distinct().join(tim.select('dailymed_id', 'chembl_id', 'efo_id').distinct(), on=['dailymed_id', 'chembl_id', 'efo_id'], how='left_anti').distinct().count()

24031

In [23]:
sameMapping

DataFrame[dailymed_id: string, chembl_id: string, efo_id: string, expanded_efo_id: array<string>]

In [24]:
diffMapping.show()

+--------------------+-------------+-----------+
|         dailymed_id|    chembl_id|     efo_id|
+--------------------+-------------+-----------+
|da982c79-4f29-467...|    CHEMBL480|EFO_0003948|
|5d080267-a1ad-471...|   CHEMBL1082|EFO_0000544|
|b03d1cc5-6142-474...|   CHEMBL1082|EFO_0004607|
|ac453fa1-5b39-40e...|    CHEMBL112|EFO_0003843|
|8c1edb39-1ae9-44f...|    CHEMBL113|EFO_0003821|
|70e115cc-6e9b-4b9...|CHEMBL1201082|EFO_0003761|
|16348d76-055f-4a7...|   CHEMBL1708|EFO_0003761|
|d1b9dd40-8140-40d...|   CHEMBL1708|EFO_0003761|
|a1de8bba-3b1d-4c9...|   CHEMBL1773|EFO_0000305|
|fc29adaa-48f1-40d...| CHEMBL389621|EFO_0000270|
|4b37f277-4319-455...|    CHEMBL435|EFO_0000537|
|4ea10b58-88d0-4d7...|    CHEMBL435|EFO_0000537|
|caaf1b7f-eaab-4fb...|    CHEMBL137|EFO_1000760|
|4e1f4eb4-19b2-eee...|CHEMBL1201129|EFO_0003811|
|c155d6ba-9a16-406...|CHEMBL1201129|EFO_1001779|
|d606a983-3146-4f5...|    CHEMBL787|EFO_0000270|
|6fd7131e-7ac4-42f...|   CHEMBL1175|EFO_0005687|
|4616f5fa-8e43-4a4..

In [25]:
(
    diffMapping
        .join(
        tim_w_ancestors.withColumnRenamed('efo_id', 'new_efo_id'),
        on = ['dailymed_id', 'chembl_id'],
        how='left')
        .distinct()
        .filter(F.col('new_efo_id').isNotNull())
        .withColumn('isRelated', F.array_contains(F.col('expanded_efo_id'), F.col('efo_id')))
        .withColumn('isRelated', F.when(F.col('isRelated').isNull(), False).otherwise(F.col('isRelated')))
        #.select('dailymed_id', 'chembl_id', 'efo_id').distinct()
        .show()
)

+--------------------+-------------+-------------+-------------+--------------------+---------+
|         dailymed_id|    chembl_id|       efo_id|   new_efo_id|     expanded_efo_id|isRelated|
+--------------------+-------------+-------------+-------------+--------------------+---------+
|144067b2-0532-476...|    CHEMBL885|  EFO_0000764|  EFO_0000180|[EFO_0000180, EFO...|     true|
|80e736d3-2017-4d6...|   CHEMBL1430| Orphanet_905|MONDO_0003937|[MONDO_0003937, O...|    false|
|89595dbd-2722-41f...|     CHEMBL85|  EFO_0003015|  EFO_0003758|[EFO_0003758, MON...|    false|
|00088ecd-e45d-408...|CHEMBL3184512|  EFO_0008588|  EFO_0003102|[EFO_0003102, EFO...|    false|
|457a026c-ebce-470...|    CHEMBL730|  EFO_0000537|  EFO_0003144|[EFO_0003144, MON...|    false|
|deb4a13c-855b-437...|CHEMBL1201129|  EFO_0003811|  EFO_0000198|[EFO_0000198, OTA...|    false|
|4ea10b58-88d0-4d7...|    CHEMBL435|  EFO_0000537|  EFO_0000373|[EFO_0000373, MON...|    false|
|0fd36cb9-c4f6-416...|CHEMBL1201566|  EF

In [26]:
# Key thing here: when Tim annotates the same drug in a reference, most likely the EFO ID is the same.

# Question: What is the reason for annotating a different drug?

In [27]:
tim.joi

AttributeError: 'DataFrame' object has no attribute 'joi'

In [None]:
tim_w_ancestors.select('chembl_id', 'dailymed_id', 'efo_id').distinct().join(dailymed.withColumnRenamed('id', 'chembl_id').select('chembl_id', 'dailymed_id', 'efo_id').distinct(), on=['chembl_id', 'dailymed_id', 'efo_id'], how='left_anti').count()

7780

In [None]:
dailymed.filter(F.col('dailymed_id').contains('fe3e2db6-1aee-4767-bc1f')).show()

+-------------+-----------+--------------------+
|           id|     efo_id|         dailymed_id|
+-------------+-----------+--------------------+
|CHEMBL1200328|EFO_0003761|fe3e2db6-1aee-476...|
|   CHEMBL1175|EFO_0003761|fe3e2db6-1aee-476...|
+-------------+-----------+--------------------+



In [None]:
tim_w_ancestors.filter(F.col('dailymed_id').contains('fe3e2db6-1aee-4767-bc1f')).show()

+------+---------+-----------+---------------+
|efo_id|chembl_id|dailymed_id|expanded_efo_id|
+------+---------+-----------+---------------+
+------+---------+-----------+---------------+



In [None]:
# Finding: it is mainly because Tim only annotates one drug per reference, whereas the manual curation gets more chembl ids.

In [None]:
dailymed.select('id').distinct().count()

1170