In [None]:
from pyspark.sql import DataFrame, SparkSession, Window
from pyspark.sql.types import DoubleType, IntegerType, StringType, StructType, StructField
import pyspark.sql.functions as F

# Local Spark session shared by every cell below: master `local[*]`,
# 15g of driver memory, application name 'spark'.
spark = (
    SparkSession.builder
    .appName('spark')
    .master('local[*]')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [2]:
# Gzipped JSON EPMC dumps for the two releases under comparison (22.02 vs 22.04).
# NOTE(review): these are absolute, machine-specific paths; adjust before re-running elsewhere.
old_epmc_path = '/Users/irene/Documents/dev/pyspark/22.02/epmc-2022-02-01.json.gz'
new_epmc_path = '/Users/irene/Documents/dev/pyspark/22.04/epmc-2022-04-21.json.gz'

# Load each dump and redistribute it over 200 partitions.
old_epmc_df = (
    spark.read
    .json(old_epmc_path)
    .repartition(200)
)
new_epmc_df = (
    spark.read
    .json(new_epmc_path)
    .repartition(200)
)

                                                                                

In [7]:
# Direct association-by-datasource outputs of the two releases (22.02.4 vs 22.04.1).
old_assocs_path = '/Users/irene/Documents/dev/pyspark/22.02.4/associationByDatasourceDirect'
new_assocs_path = '/Users/irene/Documents/dev/pyspark/22.04.1/associationByDatasourceDirect'

# Keep only the rows whose datasourceId is 'europepmc'.
old_assocs_df = (
    spark.read
    .parquet(old_assocs_path)
    .where(F.col('datasourceId') == 'europepmc')
)
new_assocs_df = (
    spark.read
    .parquet(new_assocs_path)
    .where(F.col('datasourceId') == 'europepmc')
)

## Check missing associations

In [13]:
# Associations present in the old release but absent from the new one.
# The left-anti join keeps the old rows whose (diseaseId, targetId) pair has no
# match in the new data; distinct() then drops exact duplicate rows.
#
# The result is cached: it is counted twice in the print below and reused by the
# cells that follow, and without caching every one of those actions would
# re-run the join and the de-duplication from scratch.
missing_assocs = (
    old_assocs_df
    .join(new_assocs_df, on=['diseaseId', 'targetId'], how='left_anti')
    .distinct()
    .cache()
)

# Report the number of missing rows and how many distinct diseases they involve.
print(f"Missing associations: {missing_assocs.count()} / {missing_assocs.select('diseaseId').distinct().count()} distinct diseases")

[Stage 64:>                                                         (0 + 8) / 8]

Missing associations: 104674 / 5381 distinct diseases


                                                                                

In [27]:
# Sample of the missing (disease, target) pairs, limited to disease IDs that
# start with 'EFO'; prints up to 30 rows.
(
    missing_assocs
    .select('diseaseId', 'targetId')
    .distinct()
    .where(F.col('diseaseId').startswith('EFO'))
    .show(30)
)



+-----------+---------------+
|  diseaseId|       targetId|
+-----------+---------------+
|EFO_0000094|ENSG00000026103|
|EFO_0000094|ENSG00000111142|
|EFO_0000094|ENSG00000135318|
|EFO_0000094|ENSG00000135446|
|EFO_0000094|ENSG00000170345|
|EFO_0000095|ENSG00000112038|
|EFO_0000095|ENSG00000119950|
|EFO_0000095|ENSG00000134757|
|EFO_0000095|ENSG00000137267|
|EFO_0000095|ENSG00000153802|
|EFO_0000095|ENSG00000161638|
|EFO_0000095|ENSG00000172270|
|EFO_0000095|ENSG00000184916|
|EFO_0000095|ENSG00000186350|
|EFO_0000095|ENSG00000204977|
|EFO_0000095|ENSG00000205336|
|EFO_0000095|ENSG00000236534|
|EFO_0000174|ENSG00000099942|
|EFO_0000174|ENSG00000100285|
|EFO_0000174|ENSG00000104972|
|EFO_0000174|ENSG00000109182|
|EFO_0000174|ENSG00000137504|
|EFO_0000174|ENSG00000167685|
|EFO_0000174|ENSG00000170989|
|EFO_0000174|ENSG00000172845|
|EFO_0000174|ENSG00000183929|
|EFO_0000174|ENSG00000185630|
|EFO_0000174|ENSG00000231712|
|EFO_0000178|ENSG00000101890|
|EFO_0000178|ENSG00000104332|
+---------

                                                                                

In [14]:
# A few of the diseases that appear among the missing associations
# (show() prints its default number of rows).
(
    missing_assocs
    .select('diseaseId')
    .distinct()
    .show()
)

[Stage 78:>                                                         (0 + 8) / 8]

+--------------+
|     diseaseId|
+--------------+
|     DOID_7551|
|   EFO_0004254|
|   EFO_0007391|
|   EFO_1000179|
|   EFO_1002027|
|    HP_0012076|
| MONDO_0002171|
| MONDO_0019473|
|Orphanet_98261|
|   EFO_0000569|
|   EFO_0005203|
|   EFO_0005556|
|   EFO_0005585|
|   EFO_0009000|
|   EFO_0009065|
|   EFO_0010580|
|   EFO_1001341|
|    HP_0001263|
|    HP_0002167|
| MONDO_0001110|
+--------------+
only showing top 20 rows



                                                                                

In [23]:
# Number of distinct missing diseases per ontology. The ontology is taken as
# the part of diseaseId that precedes the first '_'.
# NOTE(review): IDs that are not plain `PREFIX_number` values (e.g. URI-like
# IDs) land in a bucket of their own rather than under their ontology prefix.
(
    missing_assocs
    .select('diseaseId')
    .distinct()
    .withColumn('ontology', F.split(F.col('diseaseId'), '_')[0])
    .groupBy('ontology')
    .count()
    .show()
)

[Stage 136:>                                                        (0 + 8) / 8]

+-------------------+-----+
|           ontology|count|
+-------------------+-----+
|           Orphanet| 1534|
|                 GO|    6|
|                 HP|  426|
|               OTAR|    2|
|                EFO| 2215|
|              MONDO| 1187|
|               DOID|    6|
|                OBI|    1|
|                 MP|    1|
|//ebi.ac.uk/efo/EFO|    3|
+-------------------+-----+



                                                                                

In [26]:
# The diseases look legitimate, so inspect one concrete example in the old evidence.
#
# This old association (DOID_0050890 / ENSG00000211970) was a false positive:
# "VH" refers to visual hallucinations, not to IGHV4-61.
(
    old_epmc_df
    .where(
        (F.col('diseaseFromSourceMappedId') == 'DOID_0050890')
        & (F.col('targetFromSourceId') == 'ENSG00000211970')
    )
    .show(n=1, truncate=False, vertical=True)
)

                                                                                

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 datasourceId              | europepmc                                                                                                                                                                                                                                                                                                                                                                                                                                  
 datatypeId                | literature                                               

In [28]:
# Second example (EFO_0000094 / ENSG00000026103): this one seems legitimate.
(
    old_epmc_df
    .where(
        (F.col('diseaseFromSourceMappedId') == 'EFO_0000094')
        & (F.col('targetFromSourceId') == 'ENSG00000026103')
    )
    .show(n=1, truncate=False, vertical=True)
)

                                                                                

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------
 datasourceId              | europepmc                                                                                                                             
 datatypeId                | literature                                                                                                                            
 diseaseFromSourceMappedId | EFO_0000094                                                                                                                           
 literature                | [25755857]                                                                                                                            
 pmcIds                    | [PMC4339559]                                                                                                                          
 resourceScore  

In [None]:
# NOTE(review): this repeats the DOID_0050890 / ENSG00000211970 lookup run
# earlier in this section and the cell was never executed; candidate for removal.
(
    old_epmc_df
    .where(
        (F.col('diseaseFromSourceMappedId') == 'DOID_0050890')
        & (F.col('targetFromSourceId') == 'ENSG00000211970')
    )
    .show(n=1, truncate=False, vertical=True)
)

## Inspect new associations

In [29]:
# Evidence that is new in the 22.04 dump: rows of new_epmc_df for which the old
# dump has no row with the same literature, target and disease. The left-anti
# join keeps only the unmatched new rows; distinct() removes exact duplicates.
novelty = (
    new_epmc_df
    .join(
        old_epmc_df,
        on=['literature', 'targetFromSourceId', 'diseaseFromSourceMappedId'],
        how='left_anti',
    )
    .distinct()
)

# Peek at a single row to check the shape of the result.
novelty.first()

                                                                                

Row(literature=['22277255'], targetFromSourceId='ENSG00000159640', diseaseFromSourceMappedId='Orphanet_685', datasourceId='europepmc', datatypeId='literature', pmcIds=None, resourceScore=5.0, textMiningSentences=[Row(dEnd=125, dStart=122, section='abstract', tEnd=37, tStart=34, text='This meta-analysis shows that the ACE I/D polymorphism is associated with vasculitis susceptibility, especially in BD and HSP.')])

In [30]:
# Size of `novelty`: distinct new-release rows with no (literature, target, disease) match in the old release.
novelty.count()

                                                                                

4063636