In [28]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# Spark session used for every parquet read in this notebook.
spark = SparkSession.builder.getOrCreate()

# Disease to target relationships based on genetics

In [89]:
# ClinVar clinical significance terms we are interested in; evidence is
# matched against this list when the associations are prepared.
CLINVAR_VALIDS = [
    "affects", "risk factor",
    "pathogenic", "likely pathogenic",
    "protective", "drug response",
]

# Evidence data sources kept when building the associations set.
SOURCES_OF_INTEREST = [
    # Genetic evidence
    "uniprot_variants", "uniprot_literature", "gene_burden", "orphanet",
    "clingen", "eva", "gene2phenotype", "ot_genetics_portal",
    # Somatic
    "cancer_gene_census", "eva_somatic", "intogen",
    # Mouse models
    "impc",
    # ChEMBL is included as a benchmark
    "chembl",
]

def expand_disease_index(disease):
    """Map every disease to itself and to each of its ancestors.

    Returns distinct (`diseaseId`, `propagatedDiseaseId`) pairs, so that data
    annotated at different levels of granularity can be matched.
    """
    disease_id = f.col("id").alias("diseaseId")

    # One row per (disease, ancestor) pair.
    to_ancestors = disease.select(
        disease_id, f.explode("ancestors").alias("propagatedDiseaseId")
    )
    # One row per disease pointing at itself.
    to_self = disease.select(disease_id, f.col("id").alias("propagatedDiseaseId"))

    return to_ancestors.union(to_self).distinct()

def prepare_genetic_associations(evidence, disease_ancestors):
    """Build a pseudo-associations dataset by propagating evidence across the ontology.

    No score is computed: the result only records which data sources support
    each target/disease pair.

    Args:
        evidence: evidence DataFrame; the columns read are `targetId`,
            `diseaseId`, `datasourceId` and `clinicalSignificances`.
        disease_ancestors: DataFrame with `diseaseId` and `propagatedDiseaseId`
            columns, as produced by `expand_disease_index`.

    Returns:
        Distinct (`targetId`, `diseaseId`, `datasourceId`) rows, where
        `diseaseId` is the propagated disease id.
    """
    clinvar_valids = f.array(*[f.lit(term) for term in CLINVAR_VALIDS])
    # Rows where the overlap check is null are kept on purpose (presumably
    # evidence without `clinicalSignificances` -- confirm); only rows whose
    # significances overlap none of the accepted terms are removed.
    has_valid_significance = f.arrays_overlap(
        clinvar_valids, f.col("clinicalSignificances")
    )
    return (
        evidence.filter(has_valid_significance.isNull() | has_valid_significance)
        # Restrict the evidence set to the sources of interest
        .filter(f.col("datasourceId").isin(SOURCES_OF_INTEREST))
        # Ontology propagation. The join is a left join, so evidence whose
        # disease is missing from `disease_ancestors` has a null
        # `propagatedDiseaseId`; fall back to the original id instead of
        # emitting a null `diseaseId`.
        .join(disease_ancestors, on="diseaseId", how="left")
        .select(
            "targetId",
            f.coalesce(f.col("propagatedDiseaseId"), f.col("diseaseId")).alias(
                "diseaseId"
            ),
            "datasourceId",
        )
        .distinct()
    )

def prepare_probes_data(target):
    """Return one (`targetId`, `drugId`) row per chemical probe listed for a target.

    NOTE: a later cell of this notebook defines `prepare_probes_data` again;
    once that cell has run, this definition is replaced.
    """
    has_probes = f.col("chemicalProbes").isNotNull()
    probe_drug = f.explode(f.col("chemicalProbes.drugId")).alias("drugId")
    return target.filter(has_probes).select(f.col("id").alias("targetId"), probe_drug)

In [91]:
evidence_path = "gs://open-targets-pre-data-releases/23.09/output/etl/parquet/evidence/"
disease_path = "gs://open-targets-pre-data-releases/23.09/output/etl/parquet/diseases"
target_path = "gs://open-targets-pre-data-releases/23.09/output/etl/parquet/targets"

evidence = spark.read.parquet(evidence_path)
disease = spark.read.parquet(disease_path)

disease_ancestors = expand_disease_index(disease)
# Persisted because the frame is counted, checked and shown in this cell.
associations = prepare_genetic_associations(evidence, disease_ancestors).persist()

# `associations` holds one row per (target, disease, datasource), so the pairs
# must be de-duplicated before counting; otherwise a pair supported by several
# data sources is counted once per source.
total_number_assocs = associations.select("targetId", "diseaseId").distinct().count()

# Every source of interest must contribute at least one row; report which ones do not.
missing_sources = set(SOURCES_OF_INTEREST) - {
    row.datasourceId for row in associations.select("datasourceId").distinct().collect()
}
assert not missing_sources, f"Sources are missing from the associations set: {sorted(missing_sources)}"

associations.show()

print("TOTAL NUMBER OF ASSOCIATIONS:", total_number_assocs)

23/09/20 13:59:46 WARN CacheManager: Asked to cache already cached data.


+---------------+---------------+------------+
|       targetId|      diseaseId|datasourceId|
+---------------+---------------+------------+
|ENSG00000110921|  MONDO_0019240|         eva|
|ENSG00000166147|   OTAR_0000018|         eva|
|ENSG00000139618|  MONDO_0021350|         eva|
|ENSG00000149311|    EFO_0005771|         eva|
|ENSG00000144554|  MONDO_0100137|         eva|
|ENSG00000171298|    EFO_0003777|         eva|
|ENSG00000164692|   OTAR_0000006|         eva|
|ENSG00000181027|   OTAR_0000020|         eva|
|ENSG00000101935|  MONDO_0019824|         eva|
|ENSG00000042781|    EFO_1001455|         eva|
|ENSG00000106692|  MONDO_0020120|         eva|
|ENSG00000138823|  MONDO_0020044|         eva|
|ENSG00000006071|  MONDO_0009734|         eva|
|ENSG00000158887|    EFO_0004149|         eva|
|ENSG00000173409|  MONDO_0100038|         eva|
|ENSG00000244734|Orphanet_183651|         eva|
|ENSG00000164362|    EFO_0004244|         eva|
|ENSG00000135925|  MONDO_0019268|         eva|
|ENSG00000196

# Drug to disease relationships based on clinical indications

In [92]:
def prepare_indications(indications):
    """Flatten the indications dataset into distinct (`drugId`, `diseaseId`) pairs."""
    drug = f.col("id").alias("drugId")
    indicated_disease = f.explode("indications.disease").alias("diseaseId")
    return indications.select(drug, indicated_disease).distinct()

In [93]:
indications_path = "gs://open-targets-pre-data-releases/23.09/output/etl/parquet/indication"

indications = spark.read.parquet(indications_path)
# Persisted: the frame is counted and shown here, and joined again in a later cell.
custom_indications = prepare_indications(indications).persist()

print(f"Total drug/disease relationships: {custom_indications.count()}")

custom_indications.show(5)

Total drug/disease relationships: 55249
+------------+-----------+
|      drugId|  diseaseId|
+------------+-----------+
|    CHEMBL88|EFO_0003840|
|    CHEMBL88|EFO_0003833|
|    CHEMBL88|EFO_0006738|
|CHEMBL360328|EFO_0005611|
|CHEMBL360328|EFO_0000319|
+------------+-----------+
only showing top 5 rows



23/09/20 14:00:01 WARN CacheManager: Asked to cache already cached data.


# Building target to drug relationships to study their activity data

Two main sources are used:
- Drug/target pairs obtained by combining the genetic evidence dataset with the clinical data
- Chemical probe relationships

In [94]:
def prepare_probes_data(target):
    """Build distinct drug/target relationships from the targets' chemical probes.

    Returns the columns `targetId`, `drugId`, `isHighQualityProbe` and a
    constant `datasourceId` of "chemicalProbes". Probes without a `drugId`
    are dropped.
    """
    # One row per (target, probe) pair.
    per_probe = target.select(
        f.col("id").alias("targetId"),
        f.explode("chemicalProbes").alias("probe"),
    )
    return (
        per_probe.where(f.col("probe.drugId").isNotNull())
        .select(
            "targetId",
            f.col("probe.drugId").alias("drugId"),
            f.col("probe.isHighQuality").alias("isHighQualityProbe"),
            f.lit("chemicalProbes").alias("datasourceId"),
        )
        .distinct()
    )

def ens_to_uniprot(target):
    """Build a lookup from target id to protein id.

    Returns distinct (`targetId`, `uniprotId`) rows, keeping only the
    `proteinIds` entries whose `source` is "uniprot_swissprot".
    """
    # One row per (target, protein id entry) pair.
    per_protein = target.select(
        f.col("id").alias("targetId"),
        f.explode("proteinIds").alias("proteinId"),
    )
    is_swissprot = f.col("proteinId.source") == "uniprot_swissprot"
    return (
        per_protein.where(is_swissprot)
        .select("targetId", f.col("proteinId.id").alias("uniprotId"))
        .distinct()
    )

# `target_path` is assigned in an earlier cell, together with the evidence and disease paths.
target = spark.read.parquet(target_path)

# Both frames below are derived from the same targets dataset.
probes = prepare_probes_data(target)
uniprot_lut = ens_to_uniprot(target)

probes.show(5)

print(f"Number of target/drug relationships from probes: {probes.count()}")



                                                                                

+---------------+-------------+------------------+--------------+
|       targetId|       drugId|isHighQualityProbe|  datasourceId|
+---------------+-------------+------------------+--------------+
|ENSG00000130758|CHEMBL2436978|             false|chemicalProbes|
|ENSG00000112742|CHEMBL3109933|             false|chemicalProbes|
|ENSG00000112742|CHEMBL3422083|             false|chemicalProbes|
|ENSG00000178999|CHEMBL4206831|              true|chemicalProbes|
|ENSG00000122025|CHEMBL3290626|             false|chemicalProbes|
+---------------+-------------+------------------+--------------+
only showing top 5 rows



[Stage 436:===>                                                   (1 + 15) / 16]

Number of target/drug relationships from probes: 4714


                                                                                

In [95]:
# Joining everything

drug_to_target = (
    # Genetics-supported pairs: drug and target linked through a shared disease
    associations.join(custom_indications, on="diseaseId", how="inner")
    # Add the probe-derived pairs; columns missing on either side are allowed
    .unionByName(probes, allowMissingColumns=True)
    # Get uniprots
    .join(uniprot_lut, on="targetId")
    .select(
        "drugId", "uniprotId", "targetId",
        # Metadata of the source of the relationship
        "diseaseId", "datasourceId", "isHighQualityProbe",
    )
    # Collapse to one row per drug/target pair
    .groupBy("drugId", "uniprotId", "targetId")
    .agg(
        f.collect_set("datasourceId").alias("sources"),
        f.collect_set("isHighQualityProbe").alias("probeQualityFlags"),
    )
    # Derive the boolean flags; a null membership result is treated as False
    .select(
        "drugId",
        "uniprotId",
        "targetId",
        "sources",
        f.coalesce(f.array_contains("probeQualityFlags", True), f.lit(False)).alias("isHighQualityProbe"),
        f.coalesce(f.array_contains("sources", "chembl"), f.lit(False)).alias("isTherapeuticTarget"),
    )
    .persist()
)


drug_to_target.show()

print(f"Number of drug/target relationships: {drug_to_target.count()}")

                                                                                

+------------+---------+---------------+--------------------+------------------+-------------------+
|      drugId|uniprotId|       targetId|             sources|isHighQualityProbe|isTherapeuticTarget|
+------------+---------+---------------+--------------------+------------------+-------------------+
|  CHEMBL1000|   A6NM76|ENSG00000185821|[ot_genetics_portal]|             false|              false|
|  CHEMBL1000|   P01225|ENSG00000131808|[impc, ot_genetic...|             false|              false|
|  CHEMBL1000|   P05113|ENSG00000113525|[impc, ot_genetic...|             false|               true|
|  CHEMBL1000|   P49715|ENSG00000245848|[ot_genetics_portal]|             false|              false|
|  CHEMBL1000|   Q02447|ENSG00000172845|[ot_genetics_portal]|             false|              false|
|  CHEMBL1000|   Q14164|ENSG00000263528|[ot_genetics_portal]|             false|              false|
|  CHEMBL1000|   Q92989|ENSG00000172409|              [impc]|             false|           



Number of drug/target relationships: 28921105


                                                                                

In [97]:
# The save mode is spelled out: the write raises if the destination already exists.
drug_to_target.write.mode("errorifexists").parquet("gs://ot-team/irene/drug_to_target")

                                                                                