In [2]:
import pandas as pd
from pyspark.sql import DataFrame, SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

spark = (SparkSession.builder
         .master('local[*]')
         .config("spark.driver.memory", "15g")
         .appName('spark')
         .getOrCreate())

## Generate table with main pieces of information from AOPWiki

To include:
- AOP id
- Adverse Outcome
- Mapped Adverse Outcome to EFO
- Target Name
- Mapped target to ENSEMBL
- Key Event title
- OECD Status
- Species applicability
- Reference

In [17]:
## Load datasets generated from the AOPWiki XML

aop_df = spark.read.json('data/parsed/aopV1')
ke_df = spark.read.json('data/parsed/kevV1').drop('references')

### 1. AOP id + Reference + status

In [5]:
aop.printSchema()

root
 |-- id: long (nullable = true)
 |-- keyEvents: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- status: string (nullable = true)
 |-- stressors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- chemicalId: long (nullable = true)
 |    |    |-- inchiKey: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- preferredName: string (nullable = true)
 |    |    |-- qualityAssurance: string (nullable = true)
 |    |    |-- synonyms: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)



In [18]:
aop_df = (aop_df
       .select(F.col('id').alias('aopId'), 'keyEvents', 'status', F.col('title').alias('aopName'))
       .withColumn('reference', F.concat(F.lit('https://aopwiki.org/aops/'), F.col('aopId')))
       )

aop_df.show(5)

+-----+--------------------+------+--------------------+--------------------+
|aopId|           keyEvents|status|             aopName|           reference|
+-----+--------------------+------+--------------------+--------------------+
|  177|[{1104, key-event...|  null|Cyclooxygenase 1 ...|https://aopwiki.o...|
|  398|[{1880, molecular...|  null|Inhibition of ALD...|https://aopwiki.o...|
|  242|[{1466, key-event...|  null|Inhibition of lys...|https://aopwiki.o...|
|  241|[{68, key-event},...|  null|Latent Transformi...|https://aopwiki.o...|
|  260|[{1513, key-event...|  null|CYP2E1 activation...|https://aopwiki.o...|
+-----+--------------------+------+--------------------+--------------------+
only showing top 5 rows



### 2. Adverse Outcome

In [37]:
aop_w_ao_df = (
    # Explode all KE listed in an AOP and filter only the AO ones
    aop_df.withColumn('keyEvent', F.explode('keyEvents'))
    .filter(F.col('keyEvent.type') == 'adverse-outcome')

    .withColumn('aoId', F.col('keyEvent.id'))

    # Join with KE dataset to extract the title of the KE
    .join(
        ke_df.select(F.col('id').alias('aoId'), F.col('title').alias('ao')),
        on='aoId', how='left'
    )
    .drop('keyEvent')
    .distinct()
)

aop_w_ao_df.show(5)

+----+-----+--------------------+-----------------+--------------------+--------------------+--------------------+
|aoId|aopId|           keyEvents|           status|             aopName|           reference|                  ao|
+----+-----+--------------------+-----------------+--------------------+--------------------+--------------------+
| 360|  387|[{1479, key-event...|             null|Increased reactiv...|https://aopwiki.o...|Decrease, Populat...|
| 568|   84|[{403, key-event}...|             null|Suppression of im...|https://aopwiki.o...|impaired, Hive th...|
| 344|   38|[{265, key-event}...|TFHA/WNT Endorsed|Protein Alkylatio...|https://aopwiki.o...| N/A, Liver fibrosis|
| 351|  160|[{1016, key-event...|             null|Ionotropic gamma-...|https://aopwiki.o...| Increased Mortality|
| 360|  216|[{1444, key-event...|             null|Excessive reactiv...|https://aopwiki.o...|Decrease, Populat...|
+----+-----+--------------------+-----------------+--------------------+--------

### 3. Target name

In [22]:
# Extracted from KE where the biological event is a Protein Ontology (PRO) ID
ke_df.withColumn('be', F.explode('biologicalEvents')).filter(F.col('be.object.source') == 'PR').first()

Row(biologicalEvents=[Row(action='decreased', object=Row(name='NADH-ubiquinone oxidoreductase chain 1', source='PR', sourceId='PR:000031316'), process='NADH dehydrogenase (ubiquinone) activity')], biologicalOrganisationLevel='Cellular', id=887, keyEventStressors=["1',2'-dihydrorotenone"], organTerm=Row(name=None, sourceId=None), title='Inhibition, NADH-ubiquinone oxidoreductase  (complex I)', be=Row(action='decreased', object=Row(name='NADH-ubiquinone oxidoreductase chain 1', source='PR', sourceId='PR:000031316'), process='NADH dehydrogenase (ubiquinone) activity'))

In [38]:
targets = (
    ke_df.withColumn('be', F.explode('biologicalEvents'))
    .filter(F.col('be.object.source') == 'PR')
    .withColumn('targetName', F.col('be.object.name'))
    .select(F.col('id').alias('keId'), 'targetName').distinct()
)

targets.show(5)

+----+--------------------+
|keId|          targetName|
+----+--------------------+
|  27|   androgen receptor|
|  79|prostaglandin G/H...|
| 973|          AKT kinase|
| 980|FK506-binding pro...|
| 305|vascular endothel...|
+----+--------------------+
only showing top 5 rows



In [39]:
# Merge the target data with the wip dataset

aop_w_ao_df = (
    aop_w_ao_df

    # First I extract all KE IDs per AOP to use it as a key between the aop_w_ao and target datasets
    .withColumn('keId', F.explode('keyEvents'))
    .withColumn('keId', F.col('keId.id'))

    # I can now join it with the targets dataset
    # Inner join: only interested in AOPs for which there is a target
    .join(targets, on='keId', how='left').drop('keId')
    .distinct()
)

aop_w_ao_df.show(5, False, True)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------
 aoId       | 360                                                                                                                                                             
 aopId      | 387                                                                                                                                                             
 keyEvents  | [{1479, key-event}, {257, molecular-initiating-event}, {1863, adverse-outcome}, {1770, key-event}, {1477, key-event}, {360, adverse-outcome}, {40, key-event}]  
 status     | null                                                                                                                                                            
 aopName    | Increased reactive oxygen species production leading to population decline via mitochondrial dysfunction       

### 4. Get names for all KE in an AOP (previously it was filtered to only keep KE of type adverse outcome)

In [42]:
ke_names = (

    aop_w_ao_df.select('keyEvents', 'aopId').distinct()
    # Explode KEs again to have all KEs, not just the ones with a target
    .withColumn('keId', F.explode('keyEvents.id'))

    # Bring name of the KE
    .join(
        ke_df.select(F.col('id').alias('keId'), F.col('title').alias('keName')),
        on='keId', how='left')

    # Group KE in a struct
    .withColumn('keyEvents', F.struct('keId', 'keName'))
    # The KE dataset has a lot of duplication with NaN values - I'll remove them
    .filter(F.col('keyEvents.keName').isNotNull())
    # TODO: Filter out KE describing AO

    .groupby('aopId')
    .agg(F.collect_set('keyEvents').alias('keyEvents'))

)

ke_names.first()

Row(aopId=1, keyEvents=[Row(keId=294, keName='N/A, Unknown'), Row(keId=57, keName='Proliferation, Cell proliferation in the absence of cytotoxicity'), Row(keId=334, keName='Promotion, Hepatocelluar carcinoma'), Row(keId=142, keName='Hyperplasia, Hyperplasia')])

In [47]:
aop_w_ao_df = (
    aop_w_ao_df.drop('keyEvents')
    .join(ke_names, on='aopId', how='inner')
    .distinct()
)

aop_w_ao_df.show(2, False, True)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 aopId      | 1                                                                                                                                                                                                                                                                                                                                                         
 aoId       | 334                                                                                                                                                                                                                                                                     

### 5. Species -- hasn't been extracted

### 6. Map target to Ensembl

In [49]:
target_idx = (
    spark.read.parquet('/Users/irene/Documents/dev/pyspark/22.02.4/targets')
    .select('id', F.array('approvedSymbol').alias('approvedSymbol'),
            F.array('approvedName').alias('approvedName'),
            F.col('synonyms.label').alias('synonyms'), F.col('symbolSynonyms.label').alias('symbolSynonyms'),
            F.col('nameSynonyms.label').alias('nameSynonyms'))
    .withColumn('namesCombined', F.array_distinct(F.flatten(F.array('approvedSymbol', 'approvedName', 'synonyms', 'symbolSynonyms', 'nameSynonyms'))))
    .select(F.col('id').alias('targetId'), 'namesCombined')
)

target_idx.first()

Row(targetId='ENSG00000002586', namesCombined=['CD99', 'CD99 molecule (Xg blood group)', 'CD99 antigen', 'MIC2', 'MIC2X', 'MIC2Y', '12E7', 'E2 antigen', 'Protein MIC2', 'T-cell surface glycoprotein E2', 'HBA71', 'MSK5X', 'MIC2 (monoclonal antibody 12E7)', 'antigen identified by monoclonal 12E7, Y homolog', 'antigen identified by monoclonal antibodies 12E7, F21 and O13', 'cell surface antigen 12E7', 'cell surface antigen HBA-71', 'cell surface antigen O13', 'surface antigen MIC2'])

In [122]:
aop_enriched = (
    aop_w_ao_df.join(
        target_idx.withColumn('targetName', F.explode('namesCombined')).drop('namesCombined'),
        on='targetName', how='left'
    )
    .distinct()
)

aop_enriched.show(3)

+--------------------+-----+----+-----------------+--------------------+--------------------+--------------------+--------------------+---------------+
|          targetName|aopId|aoId|           status|             aopName|           reference|                  ao|           keyEvents|       targetId|
+--------------------+-----+----+-----------------+--------------------+--------------------+--------------------+--------------------+---------------+
|                null|    1| 334|             null|Uncharacterized l...|https://aopwiki.o...|Promotion, Hepato...|[{294, N/A, Unkno...|           null|
|NADH-ubiquinone o...|    3| 896|TFHA/WNT Endorsed|Inhibition of the...|https://aopwiki.o...|Parkinsonian moto...|[{890, Degenerati...|ENSG00000198888|
|                null|    3| 896|TFHA/WNT Endorsed|Inhibition of the...|https://aopwiki.o...|Parkinsonian moto...|[{890, Degenerati...|           null|
+--------------------+-----+----+-----------------+--------------------+----------------

In [55]:
aop_enriched.filter(F.col('targetId').isNotNull()).select('targetName').distinct().count()

92

In [None]:
# 92 out of 139 targets have been mapped to Ensembl.

### 7. Map AO to EFO

Many terms follow the pattern "{X} leads to/leading to {Adverse Outcome}". I'll extract these cases

In [92]:
aop_enriched.select('ao').distinct().show(100, truncate=False)

+----------------------------------------------------------------------------------+
|ao                                                                                |
+----------------------------------------------------------------------------------+
|Increase, Increased susceptibility to infection                                   |
|Decline, Population trajectory                                                    |
|Decrease, Reproduction                                                            |
|Decrease, Fecundity (F3)                                                          |
|N/A, Liver Steatosis                                                              |
|N/A, Steatohepatisis                                                              |
|Increase, Mutations                                                               |
|impaired, Fertility                                                               |
|impaired, Hive thermoregulation                                 

In [123]:
aop_enriched = (
    aop_enriched
    .withColumn('aoNameRaw', F.col('ao'))
    .withColumn('aoName', F.regexp_replace(F.col('aoNameRaw'), 'N/A', ''))
    .withColumn('aoName', F.trim(F.regexp_replace(F.col('aoName'), ',', '')))
)

aos = (aop_enriched.select('aoName').distinct().toPandas()['aoName'].dropna().unique())
aos[:20]


array(['increase retained nipples in males', 'Altered Larval development',
       'Liver fibrosis', 'Increased Reproductive Success',
       'Formation Liver fibrosis',
       'Exacerbation of systemic lupus erythematosus (SLE)',
       'Infant leukaemia', 'increase Preeclampsia',
       'Impairment of reproductive capacity',
       'Occurrence Epileptic seizure', 'Increased adenomas (pituitary)',
       'Increased agitation', 'Increase Leydig cell tumors',
       'Increase Chromosomal aberrations', 'Necrosis',
       'Increase lung cancer',
       'Impairedanterior-posterior axis development',
       'Occurrence Kidney toxicity', 'Decreased Population size',
       'obesity'], dtype=object)

In [56]:
from ontoma import OnToma

otmap = OnToma()

INFO     - ontoma.interface - Using EFO cache from /var/folders/54/2j7x_lqn343_d6hjm7mcv9rc0000gn/T/tmp_umk81e9.
INFO:ontoma.interface:Using EFO cache from /var/folders/54/2j7x_lqn343_d6hjm7mcv9rc0000gn/T/tmp_umk81e9.
INFO     - ontoma.interface - Loaded 20060 terms, 89072 xrefs, and 65125 synonyms from EFO cache.
INFO:ontoma.interface:Loaded 20060 terms, 89072 xrefs, and 65125 synonyms from EFO cache.


In [None]:
mappings = dict()

for event in aos:
    mapping = otmap.find_term(event)
    if len(mapping) > 0:
        mappings[event] = mapping[0].id_ot_schema

In [113]:
mappings

{'Liver fibrosis': 'HP_0001395',
 'Necrosis': 'EFO_0009426',
 'obesity': 'EFO_0001073',
 'Hepatotoxicity': 'EFO_0011052',
 'Chronic kidney disease': 'EFO_0003884',
 'Lung cancer': 'MONDO_0008903',
 'Heart failure': 'EFO_0003144',
 'Encephalitis': 'MONDO_0019956',
 'Neurodegeneration': 'HP_0002180',
 'Hypertension': 'EFO_0000537',
 'Liver Steatosis': 'HP_0001397',
 'Analgesia': 'EFO_1001484',
 'Decreased Body Weight': 'HP_0004325',
 'Testicular atrophy': 'HP_0000029',
 'Breast Cancer': 'MONDO_0007254',
 'Liver Cancer': 'MONDO_0002691',
 'Pulmonary fibrosis': 'EFO_0009448',
 'Bronchiolitis obliterans': 'EFO_0007183',
 'Testicular Cancer': 'MONDO_0003510'}

In [143]:
# Coverage is very low: 19 out of 98 AOs have been successfully mapped

In [114]:
aop_enriched.first()

Row(targetName=None, aopId=1, aoId=334, status=None, aopName='Uncharacterized liver damage leading to hepatocellular carcinoma', reference='https://aopwiki.org/aops/1', ao='Promotion, Hepatocelluar carcinoma', keyEvents=[Row(keId=294, keName='N/A, Unknown'), Row(keId=57, keName='Proliferation, Cell proliferation in the absence of cytotoxicity'), Row(keId=334, keName='Promotion, Hepatocelluar carcinoma'), Row(keId=142, keName='Hyperplasia, Hyperplasia')], targetId=None, aoNameCleaned='Promotion Hepatocelluar carcinoma')

In [124]:
# Add mappings to the table

aop_enriched = (
    aop_enriched.withColumn('aoMappedId', F.col('aoName'))
    .replace(to_replace=mappings, subset=['aoMappedId'])
    .withColumn('aoMappedId', F.when(F.col('aoMappedId').contains('_'), F.col('aoMappedId')))
)

aop_enriched.filter(F.col('aoMappedId').isNotNull()).show(2, False, True)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 targetName | null                                                                                                                                                                                                                                                                                                                                                                                      
 aopId      | 293                                                                                                                                                                                                     

### 8. Export resulting table

In [125]:
output = (
    aop_enriched
    .withColumn('species', F.lit(None))
    .select('aopId', 'aopName', 'targetName', 'targetId', 'aoId', 'aoName', 'aoNameRaw', 'aoMappedId', 'keyEvents', 'species', 'status', 'reference')
    .distinct()
)

output.show(1, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 aopId      | 1                                                                                                                                                                         
 aopName    | Uncharacterized liver damage leading to hepatocellular carcinoma                                                                                                          
 targetName | null                                                                                                                                                                      
 targetId   | null                                                                                                                                                                      
 aoId       | 334                                                          

In [126]:
output.coalesce(1).write.json('outputs/AOPWiki_reformatted_220302')