In [1]:
import os

from pyspark.sql import DataFrame, SparkSession
import pyspark.sql.functions as F

# Request the spark-xml package through the submit arguments; this is set
# before the session is built so the 'xml' reader is available afterwards.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.12:0.12.0 pyspark-shell'

# Local session using every available core, with a 15g driver heap.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('spark')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [2]:
# Path to the gzipped AOP-Wiki XML dump that every reader below loads.
aop_xml = 'data/aop-wiki-xml-2022-01-01.gz'

# XML row tags to extract. The order matters: the loading cell unpacks one
# DataFrame per entry, positionally.
entities = [
    'chemical',
    'biological-object',
    'biological-process',
    'biological-action',
    'stressor',
    'taxonomy',
    'key-event',
    'key-event-relationship',
    'aop',
    'vendor-specific',
]

In [5]:
# Read the same XML dump once per row tag and unpack the resulting DataFrames
# in the order given by `entities`.
(
    chemical_df,
    bio_object_df,
    bio_process_df,
    bio_action_df,
    stressor_df,
    taxonomy_df,
    ke_df,
    ker_df,
    aop_df,
    vendor_specific_df,
) = [
    spark.read.format('xml').option('rowTag', row_tag).option('rootTag', 'data').load(aop_xml)
    for row_tag in entities
]

## Extract Taxonomic applicability per AOP

`vendor_specific_df` contains the IDs that describe the relationships between entities.

In [None]:
def getIdLookup(field: str) -> DataFrame:
    """Build an ID lookup table for one entity type from ``vendor_specific_df``.

    Explodes the ``<field>-reference`` array column of ``vendor_specific_df``
    and returns one row per reference.

    Args:
        field: Entity type whose references are wanted. Must be one of the
            names listed in ``fields`` below.

    Returns:
        A DataFrame with two columns: ``xrefId`` (the reference's ``_id``)
        and ``aopId`` (the reference's ``_aop-wiki-id``).

    Raises:
        AssertionError: If ``field`` is not one of the supported names.
    """

    fields = [
        "aop",
        "biological-action",
        "biological-object",
        "biological-process",
        "chemical",
        "key-event",
        "key-event-relationship",
        "stressor",
        "taxonomy"
    ]

    assert field in fields, f'{field} source field is not valid. Impossible to get ID.'

    # The column is derived from `field`; it used to be hard-coded to
    # 'taxonomy-reference', so every valid field returned the taxonomy lookup.
    # NOTE(review): assumes vendor_specific_df has a '<field>-reference' column
    # with `_id` / `_aop-wiki-id` for every entry in `fields`; only
    # 'taxonomy-reference' has been inspected so far - confirm for the others.
    reference_column = f'{field}-reference'

    # NOTE(review): despite the `aopId` alias, `_aop-wiki-id` is presumably the
    # wiki ID of the referenced entity itself rather than of an AOP - verify.
    return (
        vendor_specific_df.withColumn('r', F.explode(reference_column))
        .select(F.col("r._id").alias('xrefId'), F.col("r._aop-wiki-id").alias('aopId'))
    )

In [17]:
# Show the structure of a single exploded `taxonomy-reference` entry.
(
    vendor_specific_df
    .withColumn('taxonomy-reference', F.explode('taxonomy-reference'))
    .select('taxonomy-reference')
    .printSchema()
)

root
 |-- taxonomy-reference: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _aop-wiki-id: long (nullable = true)
 |    |-- _id: string (nullable = true)



In [39]:
# One row per taxonomy reference, keeping its two identifiers under short names.
# NOTE(review): `_aop-wiki-id` is exposed here as `aop_id`, but it presumably is
# the wiki ID of the taxonomy term itself, not of an AOP - verify before
# joining on it as an AOP identifier.
aop_taxId_df = (
    vendor_specific_df
    .select(F.explode('taxonomy-reference').alias('ref'))
    .select(
        F.col('ref._aop-wiki-id').alias('aop_id'),
        F.col('ref._id').alias('taxonomy_id'),
    )
)

aop_taxId_df.show(n=5)

+------+--------------------+
|aop_id|         taxonomy_id|
+------+--------------------+
|    69|3cf5bab0-bfa0-48d...|
|   459|64d2435b-11f6-4bc...|
|    74|53fc5b57-1854-427...|
|    31|5ba6e1b4-6cc0-432...|
|    68|7790ac41-b325-4ac...|
+------+--------------------+
only showing top 5 rows



In [45]:
# Peek at each AOP's identifier, its raw `applicability` content and its title.
aop_df.select(
    '_id',
    'applicability',
    'title',
).show()

+--------------------+--------------------+--------------------+
|                 _id|       applicability|               title|
+--------------------+--------------------+--------------------+
|5eaa9d1c-efe9-4c9...|              \n    |Uncharacterized l...|
|8a7d866d-48c6-43e...|<sex>\n        <e...|Inhibition of the...|
|76a5866b-7089-434...|<sex>\n        <e...|Ecdysone receptor...|
|aa7bf0d2-0de7-4e7...|              \n    |pentachlorophenol...|
|c1e91578-c1af-4b6...|<sex>\n        <e...|Antagonist bindin...|
|4f1b4b20-d047-451...|<sex>\n        <e...|Aromatase (Cyp19a...|
|51629aac-c1a0-468...|<life-stage>\n   ...|Upregulation of T...|
|1ec3b823-1247-49a...|<sex>\n        <e...|Binding to the pi...|
|3de93557-e254-49d...|              \n    |Percellome Toxico...|
|a795dec8-d225-49e...|<life-stage>\n   ...|Chronic binding o...|
|77f42aee-2f5b-497...|<sex>\n        <e...|Chronic binding o...|
|da128666-c1a2-46c...|              \n    |Glucocorticoid Re...|
|fe31538f-ed90-48c...|<se

In [23]:
# Look up the taxonomy entry whose `_id` contains this UUID fragment.
(
    taxonomy_df
    .filter(F.col('_id').contains('3cf5bab0-bfa0'))
    .show()
)

+--------------------+------------+--------+----+------+---------+
|                 _id|_taxonomy-id|evidence|name|source|source-id|
+--------------------+------------+--------+----+------+---------+
|3cf5bab0-bfa0-48d...|        null|    null|rats|  NCBI|    10116|
+--------------------+------------+--------+----+------+---------+



In [40]:
# Swap the taxonomy UUID for the taxonomy name and de-duplicate.
# This cell overwrites `aop_taxId_df` and drops the `taxonomy_id` column that
# its own join condition reads, so it cannot be re-run without first re-running
# the cell that builds `aop_taxId_df`.
aop_taxId_df = (
    aop_taxId_df
    .join(
        taxonomy_df.select('_id', 'name'),
        on=aop_taxId_df['taxonomy_id'] == taxonomy_df['_id'],
        how='inner',
    )
    .drop('taxonomy_id', '_id')
    .distinct()
)

aop_taxId_df.show(n=5)

+------+--------------------+
|aop_id|                name|
+------+--------------------+
|  8935|  Ostrinia nubilalis|
|142643|Xenopus (Silurana...|
|   611| Meleagris gallopavo|
|   658|   Daphnia pulicaria|
|123451|  Liriomyza trifolii|
+------+--------------------+
only showing top 5 rows



In [41]:
# `aop_id` comes from the long-typed `_aop-wiki-id` field, so compare it with an
# integer literal instead of relying on an implicit string-to-number cast.
# NOTE(review): an empty result here may indicate that `aop_id` does not hold
# AOP identifiers at all (see how `aop_taxId_df` is built) - verify.
aop_taxId_df.filter(F.col('aop_id') == 202).show()

+------+----+
|aop_id|name|
+------+----+
+------+----+



In [37]:
# Count rows per taxonomy name, most frequent first.
(
    aop_taxId_df
    .groupBy('name')
    .count()
    .orderBy(F.col('count').desc())
    .show()
)

+--------------------+-----+
|                name|count|
+--------------------+-----+
|Mustela putorius ...|    1|
|            goldfish|    1|
|     Common Starling|    1|
|      Sprague-Dawley|    1|
|Argopecten irradians|    1|
|Dicentrarchus labrax|    1|
|            chickens|    1|
|Acipenser fulvescens|    1|
|   human, mouse, rat|    1|
|Xenopus (Silurana...|    1|
|      Gammarus pulex|    1|
|   Anopheles gambiae|    1|
|        Homo sapiens|    1|
|            nematode|    1|
|  Radopholus similis|    1|
|   tritonea diomedea|    1|
|    Lacerta vivipara|    1|
|      eisenia fetida|    1|
|      gilthead bream|    1|
|   Philosamia ricini|    1|
+--------------------+-----+
only showing top 20 rows



In [47]:
# Load the JSON output stored under outputs/ and inspect its schema.
df = spark.read.format('json').load('outputs/AOPWIki_refined.json')

df.printSchema()

root
 |-- adverseOutcome: string (nullable = true)
 |-- adverseOutcomeId: long (nullable = true)
 |-- aopId: long (nullable = true)
 |-- aopName: string (nullable = true)
 |-- keyEvents: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- keyEventId: long (nullable = true)
 |    |    |-- keyEventName: string (nullable = true)
 |-- reference: string (nullable = true)
 |-- status: string (nullable = true)
 |-- targetId: string (nullable = true)
 |-- targetName: string (nullable = true)



In [49]:
# Attach the taxonomy name to every row of `df` whose `aopId` matches an
# `aop_id` (inner join, the default for `join`).
# NOTE(review): `aop_id` is taken from `taxonomy-reference._aop-wiki-id`, which
# presumably identifies the taxonomy term rather than an AOP; if so, this join
# pairs unrelated identifiers - verify before using the result.
df_enriched = df.join(
    aop_taxId_df,
    on=df['aopId'] == aop_taxId_df['aop_id'],
)

df_enriched.show()

+--------------------+----------------+-----+--------------------+--------------------+--------------------+-------------------+---------------+--------------------+------+--------------------+
|      adverseOutcome|adverseOutcomeId|aopId|             aopName|           keyEvents|           reference|             status|       targetId|          targetName|aop_id|                name|
+--------------------+----------------+-----+--------------------+--------------------+--------------------+-------------------+---------------+--------------------+------+--------------------+
|N/A, Cyanosis occurs|             321|   31|Oxidation of iron...|[{321, N/A, Cyano...|https://aopwiki.o...|  Under Development|           null|hemoglobin subuni...|    31|               mouse|
|N/A, Cyanosis occurs|             321|   31|Oxidation of iron...|[{321, N/A, Cyano...|https://aopwiki.o...|  Under Development|ENSG00000160211|glucose-6-phospha...|    31|               mouse|
|Impairment, Learn...|        

In [53]:
# Limit in Spark before converting, so only the five displayed rows are
# collected to the driver instead of the whole DataFrame.
df.limit(5).toPandas()

Unnamed: 0,adverseOutcome,adverseOutcomeId,aopId,aopName,keyEvents,reference,status,targetId,targetName
0,"Altered, Amphibian metamorphosis",1101,193,Dual oxidase (DUOX) inhibition leading to alte...,"[(1101, Altered, Amphibian metamorphosis), (27...",https://aopwiki.org/aops/193,,ENSG00000137857,dual oxidase 1
1,"Impairment, T-cell dependent antibody response",984,154,Inhibition of Calcineurin Activity Leading to ...,"[(979, Interference, nuclear localization of N...",https://aopwiki.org/aops/154,EAGMST Under Review,ENSG00000235568,NFAT activation molecule 1
2,"Death/Failure, Colony",563,77,Nicotinic acetylcholine receptor activation co...,"[(560, Abnormal, Foraging activity and behavio...",https://aopwiki.org/aops/77,,,calmodulin
3,"Death/Failure, Colony",563,87,Nicotinic acetylcholine receptor activation co...,"[(560, Abnormal, Foraging activity and behavio...",https://aopwiki.org/aops/87,,,calmodulin
4,"Death/Failure, Colony",563,88,Nicotinic acetylcholine receptor activation co...,"[(560, Abnormal, Foraging activity and behavio...",https://aopwiki.org/aops/88,,,calmodulin


In [52]:
# Print the schema of `df` again (same call as in the cell that loads it).
df.printSchema()

root
 |-- adverseOutcome: string (nullable = true)
 |-- adverseOutcomeId: long (nullable = true)
 |-- aopId: long (nullable = true)
 |-- aopName: string (nullable = true)
 |-- keyEvents: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- keyEventId: long (nullable = true)
 |    |    |-- keyEventName: string (nullable = true)
 |-- reference: string (nullable = true)
 |-- status: string (nullable = true)
 |-- targetId: string (nullable = true)
 |-- targetName: string (nullable = true)



In [54]:
# Limit in Spark before converting, so only five rows of the join result are
# collected to the driver instead of the whole DataFrame.
df_enriched.limit(5).toPandas()

Unnamed: 0,adverseOutcome,adverseOutcomeId,aopId,aopName,keyEvents,reference,status,targetId,targetName,aop_id,name
0,"N/A, Cyanosis occurs",321,31,Oxidation of iron in hemoglobin leading to hem...,"[(321, N/A, Cyanosis occurs), (246, Increase, ...",https://aopwiki.org/aops/31,Under Development,,hemoglobin subunit alpha (human),31,mouse
1,"N/A, Cyanosis occurs",321,31,Oxidation of iron in hemoglobin leading to hem...,"[(321, N/A, Cyanosis occurs), (246, Increase, ...",https://aopwiki.org/aops/31,Under Development,ENSG00000160211,glucose-6-phosphate 1-dehydrogenase,31,mouse
2,"Impairment, Learning and memory",341,13,Chronic binding of antagonist to N-methyl-D-as...,"[(381, Reduced levels of BDNF), (52, Decreased...",https://aopwiki.org/aops/13,TFHA/WNT Endorsed,ENSG00000176697,brain-derived neurotrophic factor,13,Felis catus
3,,341,13,Chronic binding of antagonist to N-methyl-D-as...,"[(381, Reduced levels of BDNF), (52, Decreased...",https://aopwiki.org/aops/13,TFHA/WNT Endorsed,ENSG00000176697,brain-derived neurotrophic factor,13,Felis catus
4,Treatment-resistant gastric cancer,1651,298,Chronic reactive oxygen species leading to hum...,"[(1755, Proliferation / beta-catenin activatio...",https://aopwiki.org/aops/298,EAGMST Under Review,ENSG00000102312,protein-serine O-palmitoleoyltransferase porcu...,298,Rattus sp. ABTC 42503
