In [1]:
import os

from pyspark.sql import DataFrame, SparkSession
import pyspark.sql.functions as F

# Ask PySpark to pull in the spark-xml package; this is exported before the
# session is built so that it is in place when the session starts.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.12:0.12.0 pyspark-shell'

# Local session named 'spark' with the driver memory set to 15g.
session_builder = SparkSession.builder.master('local[*]').appName('spark')
session_builder = session_builder.config("spark.driver.memory", "15g")
spark = session_builder.getOrCreate()

In [4]:
# Location of the compressed AOP-Wiki XML dump.
aop_xml = 'data/dump/aop-wiki-xml-2022-01-01.gz'

# XML tags to extract from the dump, one dataframe per tag.
# The order matters: the loading cell unpacks the resulting dataframes
# positionally, so keep this list and that unpacking in sync.
entities = [
    "chemical",
    "biological-object",
    "biological-process",
    "biological-action",
    "stressor",
    "taxonomy",
    "key-event",
    "key-event-relationship",
    "aop",
    "vendor-specific",
]

In [5]:
# Read the dump once per entity, using the entity name as the XML row tag.
# The targets are unpacked positionally, so they must stay in the same order
# as the `entities` list.
(
    chemical_df,
    bio_object_df,
    bio_process_df,
    bio_action_df,
    stressor_df,
    taxonomy_df,
    ke_df,
    ker_df,
    aop_df,
    vendor_specific_df,
) = (
    spark.read.format('xml').options(rootTag='data', rowTag=entity).load(aop_xml)
    for entity in entities
)

## Extract Taxonomic applicability per AOP

`vendor_specific_df` contains the IDs that describe the relationships between entities.

In [6]:
def getIdLookup(field: str) -> DataFrame:
    """Build an ID lookup table from one column of `vendor_specific_df`.

    The `field` column is exploded so that every entry becomes its own row,
    and two attributes of each entry are kept: `_id` (returned as `xrefId`)
    and `_aop-wiki-id` (returned as `aopId`).

    Args:
        field: name of the `vendor_specific_df` column to explode. It must be
            one of the names whitelisted inside this function.

    Returns:
        A dataframe with the two columns `xrefId` and `aopId`.

    Raises:
        AssertionError: if `field` is not one of the whitelisted names.

    NOTE(review): `aopId` is simply the `_aop-wiki-id` of the exploded entry.
    For 'taxonomy-reference' that is presumably the AOP-Wiki ID of the
    taxonomy rather than of an AOP -- confirm before using it as an AOP key.
    """
    valid_fields = (
        "aop",
        "biological-action",
        "biological-object",
        "biological-process",
        "chemical",
        "key-event",
        "key-event-relationship",
        "stressor",
        "taxonomy",
        "taxonomy-reference",
    )
    assert field in valid_fields, f'{field} source field is not valid. Impossible to get ID.'

    # One row per entry of the requested column, exposed under the name `r`.
    exploded = vendor_specific_df.withColumn('r', F.explode(field))

    xref_id = F.col("r._id").alias('xrefId')
    wiki_id = F.col("r._aop-wiki-id").alias('aopId')
    return exploded.select(xref_id, wiki_id)

In [8]:
# Lookup built from the 'taxonomy-reference' entries (xrefId -> aopId).
taxonomyRefLookup = getIdLookup('taxonomy-reference')

# Preview the first 20 rows (20 is also the default of `show`).
taxonomyRefLookup.show(n=20)

+--------------------+------+
|              xrefId| aopId|
+--------------------+------+
|3cf5bab0-bfa0-48d...|    69|
|64d2435b-11f6-4bc...|   459|
|53fc5b57-1854-427...|    74|
|5ba6e1b4-6cc0-432...|    31|
|7790ac41-b325-4ac...|    68|
|78330d72-718b-475...|  4578|
|5fe4e410-4fe9-49b...|   656|
|a08c5500-86f9-4bc...|720890|
|8c8543e6-1276-412...|   525|
|42be7f04-cce8-4e2...|     1|
|61c52f46-66ce-416...|    30|
|479c8a10-07aa-452...|   623|
|737479c1-185c-4e0...|    71|
|cb93063e-3148-4b3...|  2965|
|eb0c0f76-2899-4db...|  2971|
|3fbffa75-bd1e-451...|   646|
|8aca03a4-1b8f-4a0...|    66|
|85f5eeb8-57e2-44e...|   515|
|e816787c-ec48-490...|  5084|
|9b76e3b8-1044-496...|    45|
+--------------------+------+
only showing top 20 rows



In [37]:
# Taxonomy names keyed by their `_id`; the name becomes the `applicability`
# annotation.
taxonomy_names = taxonomy_df.select('_id', F.col('name').alias('applicability'))

# Attach the taxonomy name to every lookup row whose `xrefId` matches a
# taxonomy `_id`, then keep only the distinct (aopId, applicability) pairs.
# NOTE(review): `aopId` comes straight from the taxonomy-reference lookup;
# verify that it identifies an AOP and not the taxonomy entry itself.
aop_w_applicability = (
    taxonomyRefLookup
    .join(taxonomy_names, taxonomy_df['_id'] == taxonomyRefLookup['xrefId'], 'inner')
    .drop('_id', 'xrefId')
    .distinct()
)

aop_w_applicability.show(5)

+------+--------------------+
| aopId|       applicability|
+------+--------------------+
|  8935|  Ostrinia nubilalis|
|142643|Xenopus (Silurana...|
|   611| Meleagris gallopavo|
|   658|   Daphnia pulicaria|
|123451|  Liriomyza trifolii|
+------+--------------------+
only showing top 5 rows



In [22]:
# Spot check: which applicability values are attached to ID 69?
aop_w_applicability.where(F.col('aopId') == '69').show()

+-----+-------------+
|aopId|applicability|
+-----+-------------+
|   69|         rats|
+-----+-------------+



In [30]:
# Sanity check: number of rows per species, most frequent first.
# Every species shows up exactly once, which is suspicious -- a species would
# be expected to apply to more than one AOP.
# NOTE(review): possibly `aopId` is not really an AOP identifier; investigate.
species_counts = aop_w_applicability.groupBy('applicability').count()
species_counts.orderBy(F.desc('count')).show(truncate=False)

+--------------------------------------+-----+
|applicability                         |count|
+--------------------------------------+-----+
|Mustela putorius furo                 |1    |
|Common Starling                       |1    |
|goldfish                              |1    |
|Sprague-Dawley                        |1    |
|Argopecten irradians                  |1    |
|chickens                              |1    |
|Dicentrarchus labrax                  |1    |
|human, mouse, rat                     |1    |
|Acipenser fulvescens                  |1    |
|Xenopus (Silurana) n. sp. tetraploid-1|1    |
|Anopheles gambiae                     |1    |
|Gammarus pulex                        |1    |
|Homo sapiens                          |1    |
|nematode                              |1    |
|Radopholus similis                    |1    |
|Lacerta vivipara                      |1    |
|eisenia fetida                        |1    |
|tritonea diomedea                     |1    |
|gilthead bre

In [38]:
# Some names bundle several species in one string (e.g. "human, mouse, rat").
# Split them on ", " and emit one row per species.
species_tokens = F.split(F.col('applicability'), ', ')
aop_w_applicability = aop_w_applicability.withColumn(
    'applicability', F.explode(species_tokens)
)


In [39]:
# Previously reformatted AOP-Wiki records, read back from JSON.
aop_reformatted = spark.read.json('outputs/AOPWiki_reformatted_220302')

# Left join on `aopId`: records without a matching applicability are kept and
# get a null in the new column.
aop_annotation = aop_reformatted.join(aop_w_applicability, 'aopId', 'left')

aop_annotation.first()

Row(aopId=1, aoId=334, aoMappedId=None, aoName='Promotion Hepatocelluar carcinoma', aoNameRaw='Promotion, Hepatocelluar carcinoma', aopName='Uncharacterized liver damage leading to hepatocellular carcinoma', keyEvents=[Row(keId=294, keName='N/A, Unknown'), Row(keId=57, keName='Proliferation, Cell proliferation in the absence of cytotoxicity'), Row(keId=334, keName='Promotion, Hepatocelluar carcinoma'), Row(keId=142, keName='Hyperplasia, Hyperplasia')], reference='https://aopwiki.org/aops/1', status=None, targetId=None, targetName=None, applicability='Homo sapiens')

In [40]:
# Preview five rows as a pandas dataframe. The limit is applied on the Spark
# side so that only those rows are collected to the driver, instead of
# converting the whole dataframe just to display its head.
aop_annotation.limit(5).toPandas()

Unnamed: 0,aopId,aoId,aoMappedId,aoName,aoNameRaw,aopName,keyEvents,reference,status,targetId,targetName,applicability
0,26,356,,Increased Oxidative damage,"Increased, Oxidative damage",Calcium-mediated neuronal ROS production and e...,"[(40, Decrease, Mitochondrial ATP production),...",https://aopwiki.org/aops/26,,,,
1,29,363,,Altered Reproductive behaviour,"Altered, Reproductive behaviour",Estrogen receptor agonism leading to reproduct...,"[(111, Agonism, Estrogen receptor), (252, Incr...",https://aopwiki.org/aops/29,,ENSG00000091831,estrogen receptor,
2,29,363,,Altered Reproductive behaviour,"Altered, Reproductive behaviour",Estrogen receptor agonism leading to reproduct...,"[(111, Agonism, Estrogen receptor), (252, Incr...",https://aopwiki.org/aops/29,,,,
3,29,339,,Altered Larval development,"Altered, Larval development",Estrogen receptor agonism leading to reproduct...,"[(111, Agonism, Estrogen receptor), (252, Incr...",https://aopwiki.org/aops/29,,ENSG00000091831,estrogen receptor,
4,29,339,,Altered Larval development,"Altered, Larval development",Estrogen receptor agonism leading to reproduct...,"[(111, Agonism, Estrogen receptor), (252, Incr...",https://aopwiki.org/aops/29,,,,


In [42]:
# Persist the enriched annotation as JSON, coalesced into a single partition.
# 'overwrite' keeps the cell re-runnable: with the default save mode the write
# fails as soon as the output directory already exists.
(
    aop_annotation
    .coalesce(1)
    .write
    .mode('overwrite')
    .json('outputs/AOPWiki_reformatted_enriched_220302')
)

In [43]:
# Show one record vertically, without truncating long values.
aop_annotation.show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 aopId         | 1                                                                                                                                                                         
 aoId          | 334                                                                                                                                                                       
 aoMappedId    | null                                                                                                                                                                      
 aoName        | Promotion Hepatocelluar carcinoma                                                                                                                                         
 aoNameRaw     | Promotion, Hepatocelluar carcinoma         