In [2]:
import pandas as pd
from pyspark.sql import DataFrame, SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Local Spark session for this notebook: master 'local[*]', 15g of driver memory.
spark = (
    SparkSession.builder
    .appName('spark')
    .master('local[*]')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

## 1. Inspect data

In [78]:
# Directory holding the openFDA outputs read below. Kept in one place so the
# location only has to be changed once when the notebook runs on another machine.
OPENFDA_DIR = '/Users/irene/Documents/dev/datasets/openfda'

# Full target-level table (one row per drug / target / reaction with its llr).
adverseTargetReactions = spark.read.parquet(f'{OPENFDA_DIR}/adverseTargetReactions')

# Target-level table that additionally carries a `critval` column
# (presumably the rows whose llr exceeds that critical value -- TODO confirm).
significantAdverseTargetReactions = spark.read.parquet(f'{OPENFDA_DIR}/significantAdverseTargetReactions')

In [4]:
# Row counts of the full and of the significant target-level tables.
for label, frame in (
    ('Total count:', adverseTargetReactions),
    ('Significant count:', significantAdverseTargetReactions),
):
    print(label, frame.count())

Total count: 1940905
Significant count: 25193


In [90]:
# Number of distinct targets present in the significant set.
(
    significantAdverseTargetReactions
    .select('targetId')
    .dropDuplicates()
    .count()
)

868

In [10]:
# Peek at one record of the significant set to see its schema and values.
print(significantAdverseTargetReactions.head())

Row(chembl_id='CHEMBL3833368', llr=150.8579735206149, critval=82.77017808906385, event='abdominal discomfort', targetId='ENSG00000041880', count=217, meddraCode='10000059')


In [15]:
# Fetch the same (drug, target, MedDRA code) record from the full table to
# compare it with the significant-set row printed above.
is_example_record = (
    (F.col('chembl_id') == 'CHEMBL3833368')
    & (F.col('targetId') == 'ENSG00000041880')
    & (F.col('meddraCode') == '10000059')
)
adverseTargetReactions.where(is_example_record).first()

Row(chembl_id='CHEMBL3833368', reaction_reactionmeddrapt='abdominal discomfort', uniq_report_ids_by_target=7069, targetId='ENSG00000041880', A=217, uniq_report_ids_by_reaction=31060, C=6852, B=30843, D=4283942, aterm=-1077.1399872662014, cterm=-44124.73187004238, acterm=-45352.729830829194, llr=150.8579735206149, meddraCode='10000059')

### 1.1. Is the significant set a subset of the total set? --> Not exactly

In [23]:
# A record is identified by the triple ('event', 'chembl_id', 'targetId').
key_cols = ['event', 'chembl_id', 'targetId']

# Distinct keys of the full table (its reaction column renamed to 'event').
all_keys = (
    adverseTargetReactions
    .withColumnRenamed('reaction_reactionmeddrapt', 'event')
    .select(*key_cols)
    .distinct()
)
# Distinct keys of the significant table.
significant_keys = significantAdverseTargetReactions.select(*key_cols).distinct()

print(all_keys.count())

# Stack both key sets and de-duplicate: if the significant set were a subset
# of the full one, this would equal the count printed just above.
all_keys.union(significant_keys).distinct().count()


1940905


1941108

In [24]:
# Count the significant records whose key also exists in the full table.
key_cols = ['event', 'chembl_id', 'targetId']
all_keys = (
    adverseTargetReactions
    .withColumnRenamed('reaction_reactionmeddrapt', 'event')
    .select(*key_cols)
    .distinct()
)
(
    significantAdverseTargetReactions
    .select(*key_cols)
    .join(all_keys, on=key_cols, how='inner')
    .count()
)

24990

In [89]:
# Open question: why does the significant set contain event/target pairs that
# are absent from the full table? List them with their llr and critval.
key_cols = ['event', 'chembl_id', 'targetId']
all_keys = (
    adverseTargetReactions
    .withColumnRenamed('reaction_reactionmeddrapt', 'event')
    .select(*key_cols)
    .distinct()
)
(
    significantAdverseTargetReactions
    .select(*key_cols, 'llr', 'critval')
    # left_anti keeps only the significant rows with no matching key in all_keys
    .join(all_keys, on=key_cols, how='left_anti')
    .show()
)

+--------------------+-------------+---------------+------------------+------------------+
|               event|    chembl_id|       targetId|               llr|           critval|
+--------------------+-------------+---------------+------------------+------------------+
| acute kidney injury| CHEMBL289480|ENSG00000010322|195.84136964049594| 8.493369839319712|
|            delirium| CHEMBL289480|ENSG00000010322| 15.25061893917973| 8.493369839319712|
|             prurigo| CHEMBL289480|ENSG00000010322|12.377978259030897| 8.493369839319712|
|             syncope|  CHEMBL19236|ENSG00000010322|29.380202199034102| 8.493369839319712|
|general physical ...|  CHEMBL19236|ENSG00000010322|22.873389389446857| 8.493369839319712|
| penile size reduced|CHEMBL2108709|ENSG00000204291|43.160464450156724|10.001785956366072|
|wound infection p...|CHEMBL2108709|ENSG00000204291| 15.25582584750191|10.001785956366072|
|cardiovascular in...| CHEMBL289480|ENSG00000010322|16.934263449239552| 8.493369839319712|

In [86]:
# Confirm directly that one of the pairs listed above has no row in the full table.
is_missing_pair = (
    (F.col('reaction_reactionmeddrapt') == 'acute kidney injury')
    & (F.col('targetId') == 'ENSG00000010322')
)
adverseTargetReactions.where(is_missing_pair).count()

0

### 1.1.1. Is this also the case for the drug-level analysis? --> No

In [91]:
# Drug-level counterparts of the two target-level tables loaded earlier.
adverseDrugReactions = spark.read.load(
    '/Users/irene/Documents/dev/datasets/openfda/adverseDrugReactions',
    format='parquet',
)

significantAdverseDrugReactions = spark.read.load(
    '/Users/irene/Documents/dev/datasets/openfda/significantAdverseDrugReactions',
    format='parquet',
)

In [92]:
# Show one record of each drug-level table to compare their schemas.
for frame in (adverseDrugReactions, significantAdverseDrugReactions):
    print(frame.first())

Row(chembl_id='CHEMBL117785', reaction_reactionmeddrapt='abasia', uniq_report_ids_by_reaction=6696, uniq_report_ids_by_drug=1452, A=3, C=1449, B=6693, D=4750650, aterm=-23.131959970503605, cterm=-11730.333176658552, acterm=-11753.660994590573, llr=0.19585796151659451, meddraCode='10049460')
Row(chembl_id='CHEMBL1148', event='acute kidney injury', count=190, llr=219.10278395369824, critval=31.822831907447835, meddraCode='10069339')


In [93]:
# Same anti-join as for the targets, keyed on (event, chembl_id): list the
# significant drug-level records that have no match in the full drug-level table.
drug_key_cols = ['event', 'chembl_id']
all_drug_keys = (
    adverseDrugReactions
    .withColumnRenamed('reaction_reactionmeddrapt', 'event')
    .select(*drug_key_cols)
    .distinct()
)
(
    significantAdverseDrugReactions
    .select(*drug_key_cols, 'llr', 'critval')
    .join(all_drug_keys, on=drug_key_cols, how='left_anti')
    .show()
)

+-----+---------+---+-------+
|event|chembl_id|llr|critval|
+-----+---------+---+-------+
+-----+---------+---+-------+



### 1.2. How is the target inferred? --> Via the MoA. Does it strictly use the drug's `linkedTargets`? --> Yes

In [22]:
# Example significant record; its drug is looked up in the drug index next.
example_row = significantAdverseTargetReactions.first()
print(example_row)

Row(chembl_id='CHEMBL3833368', llr=150.8579735206149, critval=82.77017808906385, event='abdominal discomfort', targetId='ENSG00000041880', count=217, meddraCode='10000059')


In [36]:
# Drug index: molecule id (as 'chembl_id') alongside its linkedTargets.rows
# array (as 'targets').
molecule = spark.read.parquet('/Users/irene/Documents/dev/pyspark/21.09.5/molecule')
drug_idx = molecule.select(
    F.col('id').alias('chembl_id'),
    F.col('linkedTargets.rows').alias('targets'),
)

# Linked targets of the drug from the example record above.
drug_idx.where(F.col('chembl_id') == 'CHEMBL3833368').show(truncate=False)

+-------------+---------------------------------------------------+
|chembl_id    |targets                                            |
+-------------+---------------------------------------------------+
|CHEMBL3833368|[ENSG00000143799, ENSG00000129484, ENSG00000041880]|
+-------------+---------------------------------------------------+



### 1.2.1. Drugs with no associated target. Do we have AEs? --> No

In [99]:
# Drugs whose 'targets' array is null, i.e. with no linked target in the index.
drugs_without_target = (
    drug_idx
    .where(F.col('targets').isNull())
    .select('chembl_id')
    .dropDuplicates()
)
drugs_without_target.show(n=5)

+-------------+
|    chembl_id|
+-------------+
| CHEMBL110739|
|   CHEMBL1195|
|CHEMBL1200632|
|CHEMBL1200691|
|CHEMBL1201042|
+-------------+
only showing top 5 rows



In [None]:
# NOTE(review): this cell was never executed (In [None]) and repeats the first
# count printed by the following cell -- candidate for removal.
# Rows of the full table that belong to drugs without any linked target.
adverseTargetReactions.join(drugs_without_target, 'chembl_id', 'inner').count()

In [100]:
# For the full and the significant table in turn, count the rows that belong
# to drugs without any linked target.
for reactions in (adverseTargetReactions, significantAdverseTargetReactions):
    print(reactions.join(drugs_without_target, on='chembl_id', how='inner').count())

0
0


### 1.2.2. Drugs with 1 associated target. Do we have AEs with only 1 target? --> Yes

In [101]:
# Drugs whose 'targets' array holds exactly one element.
has_single_target = F.size(F.col('targets')) == 1
drugs_one_target = drug_idx.where(has_single_target).select('chembl_id').dropDuplicates()
drugs_one_target.show(5)

+-------------+
|    chembl_id|
+-------------+
|CHEMBL1743017|
|CHEMBL2105675|
|CHEMBL2107826|
|CHEMBL2346976|
|CHEMBL3545005|
+-------------+
only showing top 5 rows



In [105]:
# Restrict the full table to single-target drugs, collect the distinct targets
# reported per drug, and return the drug with the largest such set.
single_target_aes = adverseTargetReactions.join(drugs_one_target, on='chembl_id', how='inner')
targets_per_drug = (
    single_target_aes
    .groupby('chembl_id')
    .agg(F.collect_set('targetId').alias('targets'))
    .withColumn('size', F.size(F.col('targets')))
)
targets_per_drug.orderBy(F.desc('size')).first()

Row(chembl_id='CHEMBL1201294', targets=['ENSG00000112038'], size=1)

### 1.2.3. Drugs with multiple associated targets. Do we have AEs with multiple targets? --> Yes, but the number of targets can differ.

In [106]:
# A few drugs with more than one linked target.
# NOTE(review): the recorded output includes a `size` column, which drug_idx
# only gains in a later cell -- this cell was apparently run out of order.
has_many_targets = F.size(F.col('targets')) > 1
drug_idx.where(has_many_targets).show(3, truncate=False)

+----------+----------------------------------+----+
|chembl_id |targets                           |size|
+----------+----------------------------------+----+
|CHEMBL1228|[ENSG00000073756, ENSG00000095303]|2   |
|CHEMBL1297|[ENSG00000073756, ENSG00000095303]|2   |
|CHEMBL1671|[ENSG00000169252, ENSG00000043591]|2   |
+----------+----------------------------------+----+
only showing top 3 rows



In [61]:
# Rows per target for drug CHEMBL1738797, first in the full table and then in
# the significant one.
# `DataFrame.show()` prints the table itself and returns None, so it must not
# be wrapped in `print()` (that only emits a stray "None" after each table).
adverseTargetReactions.filter(F.col('chembl_id') == 'CHEMBL1738797').groupby('targetId').count().show()
significantAdverseTargetReactions.filter(F.col('chembl_id') == 'CHEMBL1738797').groupby('targetId').count().show()

+---------------+-----+
|       targetId|count|
+---------------+-----+
|ENSG00000143924|  256|
|ENSG00000165731|  123|
|ENSG00000171094|  246|
+---------------+-----+

None
+--------+-----+
|targetId|count|
+--------+-----+
+--------+-----+

None


In [111]:
# Compute the number of targets per drug in both datasets.

# Drug index: length of each drug's linked-target array.
drug_idx = drug_idx.withColumn('size', F.size(F.col('targets')))

# Full target-level table: distinct targets reported per drug, and how many.
targets_per_drug = adverseTargetReactions.groupby('chembl_id').agg(
    F.collect_set('targetId').alias('targets')
)
adverseTargetReactions_2 = targets_per_drug.withColumn('size', F.size(F.col('targets')))

adverseTargetReactions_2.show(n=1, truncate=False, vertical=True)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 chembl_id | CHEMBL1637                                                                                                                                                                                  
 targets   | [ENSG00000177084, ENSG00000101868, ENSG00000146143, ENSG00000062822, ENSG00000167325, ENSG00000148229, ENSG00000198056, ENSG00000077514, ENSG00000106628, ENSG00000014138, ENSG00000175482] 
 size      | 11                                                                                                                                                                                          
only showing top 1 row



In [67]:
# Drugs whose (chembl_id, size) pair has no match in drug_idx, i.e. whose
# number of targets in the AE table differs from the drug index (or whose id
# is missing from it).
adverseTargetReactions_2.join(
    drug_idx,
    on=['chembl_id', 'size'],
    how='left_anti',
).show(n=5, truncate=False, vertical=True)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 chembl_id | CHEMBL1637                                                                                                                                                                                  
 size      | 11                                                                                                                                                                                          
 targets   | [ENSG00000177084, ENSG00000101868, ENSG00000146143, ENSG00000062822, ENSG00000167325, ENSG00000148229, ENSG00000198056, ENSG00000077514, ENSG00000106628, ENSG00000014138, ENSG00000175482] 
-RECORD 1---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [112]:
# Compare the targets of CHEMBL3622533 in the drug index with those seen in
# the AE table.
# `DataFrame.show()` prints the table itself and returns None, so it must not
# be wrapped in `print()` (that only emits a stray "None" after each table).
print('Sometimes there are less targets in the FAERS analysis than in the drugs set (1 instead of 14)?:')
drug_idx.filter(F.col('chembl_id') == 'CHEMBL3622533').show()
adverseTargetReactions_2.filter(F.col('chembl_id') == 'CHEMBL3622533').show()

Sometimes there are less targets in the FAERS analysis than in the drugs set (1 instead of 14)?:
+-------------+--------------------+----+
|    chembl_id|             targets|size|
+-------------+--------------------+----+
|CHEMBL3622533|[ENSG00000094631,...|  14|
+-------------+--------------------+----+

None
+-------------+-----------------+----+
|    chembl_id|          targets|size|
+-------------+-----------------+----+
|CHEMBL3622533|[ENSG00000051382]|   1|
+-------------+-----------------+----+

None
