In [1]:
import pandas as pd

## 1. Extract all compound IDs that belong to a Chemical Probes Set

Compound sets per data source:
```
Bromodomains chemical toolbox, 277
Chemical Probes.org, 15
Gray Laboratory Probes, 240
MLP probes, 14 --> Legacy compound set
Open Science Probes, 213
High-quality chemical probes, 395 --> Turn into a boolean label
Nature Chemical Biology Probes, 19 --> Legacy compound set
opnMe Portal, 219
Probe Miner (suitable probes), 227
Protein methyltransferases chemical toolbox, 278
SGC Probes, 28
Tool Compound Set, 19`
```


In [4]:
compound2compoundset = pd.read_csv("/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.compoundtocompoundset.csv", sep=";", header=0, usecols=['compound_id', 'compoundset_id'])
compoundsets = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.compoundset.csv",
        sep=";",
        header=0,
        usecols=['compoundsetid', 'name', 'source_url'])
    .rename(columns={'name' : 'source'})
)

In [5]:
probes_compoundsetids = [277, 15, 240, 213, 395, 219, 227, 278, 28, 19]

compound2compoundset = (
    compound2compoundset.merge(
        compoundsets[['compoundsetid', 'source', 'source_url']],
        left_on='compoundset_id',
        right_on='compoundsetid',
        how='inner'
    )
    # Select compounds coming from Probes data sources
    .query('compoundset_id in @probes_compoundsetids')
    .drop(['compoundsetid', 'compoundset_id'], axis=1)
)

print(compound2compoundset.shape[0])
compound2compoundset.head()

4479


Unnamed: 0,compound_id,source,source_url
29444,5105,Chemical Probes.org,http://www.chemicalprobes.org/
29445,14759,Chemical Probes.org,http://www.chemicalprobes.org/
29446,9448,Chemical Probes.org,http://www.chemicalprobes.org/
29447,6072,Chemical Probes.org,http://www.chemicalprobes.org/
29448,5011,Chemical Probes.org,http://www.chemicalprobes.org/


# 🧐 Question: If probe in different sources, do we duplicate or want the id to be unique? Unique ID

## 2.1 Pair compound IDs with probes IDs

In [6]:
probes = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.probe.csv",
        sep=";",
        header=0,
        usecols=['probeid', 'compound_id', 'control', 'origin_id'],
        dtype={'probeid':'int64', 'control':'category'})
    .rename(columns={'probeid':'probe_id', 'origin_id':'origin'})
)
probes.head(5)

Unnamed: 0,probe_id,compound_id,origin,control
0,19154,6003,calculated,
1,15124,6317,experimental,
2,15085,6337,experimental,
3,15098,7180,experimental,
4,19148,7499,calculated,['GSK990']


In [7]:
# Controls are read as strings
def str_to_list(X):
    if isinstance(X, str):
        arr = X.strip('[]').split(',')
        arr = [e.strip("''") for e in arr]
        return arr
    return None
    
    
probes['control'] = probes['control'].apply(lambda X: str_to_list(X))

probes = probes.explode('control')

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


In [8]:
probes.head()

Unnamed: 0,probe_id,compound_id,origin,control
0,19154,6003,calculated,
1,15124,6317,experimental,
2,15085,6337,experimental,
3,15098,7180,experimental,
4,19148,7499,calculated,GSK990


In [9]:
# There are 13 compounds not present in the probes dataset
# All coming from 'High-quality chemical probes'

df = compound2compoundset.merge(
    probes,
    on='compound_id',
    how='left'
)

df.head(5)

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,
1,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14792.0,experimental,
2,5105,Chemical Probes.org,http://www.chemicalprobes.org/,19200.0,calculated,
3,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14767.0,experimental,
4,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14793.0,experimental,


## 2.2 Extract compound name and inchikey

In [10]:
compounds = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.compound.csv",
        sep=";",
        header=0,
        usecols=['compoundid', 'name', 'inchikey'])
    .rename(columns={'compoundid':'compound_id', 'name':'compound_name', 'inchikey':'inchiKey'})
)

In [11]:
df = df.merge(
    compounds,
    on='compound_id',
    how='left'
)

df.head(5)

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control,compound_name,inchiKey
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N
1,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14792.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N
2,5105,Chemical Probes.org,http://www.chemicalprobes.org/,19200.0,calculated,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N
3,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14767.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N
4,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14793.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N


## 3. Extract target associated with probe

In [12]:
# First extract target_id
probes2targets = pd.read_csv("/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.probetobasetarget.csv", 
                             sep=";",
                             header=0,
                             usecols=['probe_id', 'basetarget_id'])
                             

df = df.merge(
    probes2targets,
    on='probe_id',
    how='left'
)

Count of probes without an associated target per datasource:
```
MLP Probes                                     114
High-quality chemical probes                    44
Nature Chemical Biology Probes                  39
Chemical Probes.org                             25
Protein methyltransferases chemical toolbox     20
SGC Probes                                      20
Open Science Probes                              3
Gray Laboratory Probes                           2
opnMe Portal                                     1
Probe Miner (suitable probes)                    1
```

In [13]:
# Then extract target name

base_targets = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.basetarget.csv",
        sep=";",
        header=0,
        usecols=['basetargetid', 'name', 'gene_name'])
    .rename(columns={'basetargetid':'basetarget_id', 'name':'target_name', 'gene_name':'target_symbols'})
)

In [14]:
df = df.merge(
    base_targets,
    on='basetarget_id',
    how='left'
)

df.head(10)

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control,compound_name,inchiKey,basetarget_id,target_name,target_symbols
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7
1,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14792.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7
2,5105,Chemical Probes.org,http://www.chemicalprobes.org/,19200.0,calculated,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7
3,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14767.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N,12775.0,Aurora kinase A,AURKA
4,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14767.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N,10920.0,Aurora kinase B,AURKB
5,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14767.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N,11391.0,Aurora kinase C,AURKC
6,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14793.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N,12775.0,Aurora kinase A,AURKA
7,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14793.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N,10920.0,Aurora kinase B,AURKB
8,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14793.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N,11391.0,Aurora kinase C,AURKC
9,9448,Chemical Probes.org,http://www.chemicalprobes.org/,14766.0,experimental,,XL019,ISOCDPQFIXDIMS-QHCPKHFHSA-N,13812.0,Tyrosine-protein kinase JAK2,JAK2


In [15]:
# Extract uniprot ID from target

basetarget2target = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.targettobasetarget.csv",
        sep=";",
        header=0,
        usecols=['basetarget_id', 'target_id']))

targets = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.target.csv",
        sep=";",
        header=0,
        usecols=['targetid', 'organism_id', 'uniprotid'])
    .rename(columns={'targetid': 'target_id', 'uniprotid':'uniprot_ids'})
)

In [16]:
df = (
    # Map from basetarget to target
    df.merge(
        basetarget2target,
        on='basetarget_id',
        how='left'
    )
    # Extract organism_id and uniprot_ids from target
    .merge(
        targets,
        on='target_id',
        how='left'
    )
    # Select human as organism (organism_id = [5428, 2832])
    .query('organism_id in [5428, 2832]')
)


In [17]:
# Split and explode the uniprot_ids columns

df['uniprot_id'] = df['uniprot_ids'].apply(lambda X: X.split(','))
df = df.explode('uniprot_id')

In [18]:
df.head()

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control,compound_name,inchiKey,basetarget_id,target_name,target_symbols,target_id,organism_id,uniprot_ids,uniprot_id
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,66118.0,2832.0,Q13164,Q13164
3,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,59987.0,2832.0,Q13164,Q13164
4,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,59988.0,2832.0,Q13164,Q13164
5,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,59989.0,2832.0,Q13164,Q13164
6,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,59990.0,2832.0,Q13164,Q13164


## 4. Extract scores

In [21]:
scores = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.compoundtargetscore.csv",
        sep=";",
        header=0,
        usecols=['compound_id', 'basetarget_id', 'percentage', 'score_id'])
    # Filter with only score_ids of interest
    .query('score_id in [1, 5, 6, 7]')
)

df_s = df.merge(
        scores,
        on=['basetarget_id', 'compound_id'],
        how='left'
)

In [22]:
df_s

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control,compound_name,inchiKey,basetarget_id,target_name,target_symbols,target_id,organism_id,uniprot_ids,uniprot_id,score_id,percentage
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,66118.0,2832.0,Q13164,Q13164,5.0,66.75
1,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,66118.0,2832.0,Q13164,Q13164,6.0,25.00
2,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,66118.0,2832.0,Q13164,Q13164,7.0,60.00
3,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,59987.0,2832.0,Q13164,Q13164,5.0,66.75
4,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,59987.0,2832.0,Q13164,Q13164,6.0,25.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71379,109419,High-quality chemical probes,,15266.0,experimental,BAY-173,BAY-3153,LZSPYTNDRPEZBK-OXYJHPMESA-N,14213.0,C-C chemokine receptor type 1,CCR1,45404.0,2832.0,P32246,P32246,7.0,70.00
71380,109421,High-quality chemical probes,,15393.0,experimental,BI-4420,BI-1046,MLOGCHDCTRINMU-UHFFFAOYSA-N,17345.0,Probable G-protein coupled receptor 142,GPR142,67796.0,2832.0,Q7Z601,Q7Z601,7.0,20.00
71381,109421,High-quality chemical probes,,15393.0,experimental,BI-4420,BI-1046,MLOGCHDCTRINMU-UHFFFAOYSA-N,17345.0,Probable G-protein coupled receptor 142,GPR142,54436.0,2832.0,Q7Z601,Q7Z601,7.0,20.00
71382,109421,High-quality chemical probes,,15393.0,experimental,BI-4420,BI-1046,MLOGCHDCTRINMU-UHFFFAOYSA-N,17345.0,Probable G-protein coupled receptor 142,GPR142,54437.0,2832.0,Q7Z601,Q7Z601,7.0,20.00


In [23]:
df_s["probesDrugsScore"] = df_s.apply(lambda X: X.percentage if X.score_id == 7 else None, axis=1)
df_s['probeMinerScore'] = df_s.apply(lambda X: X.percentage if X.score_id == 1 else None, axis=1)
df_s['scoreInCells'] = df_s.apply(lambda X: X.percentage if X.score_id == 5 else None, axis=1)
df_s['scoreInOrganisms'] = df_s.apply(lambda X: X.percentage if X.score_id == 6 else None, axis=1)

In [24]:
# Right now I have duplicated rows due to:
# several data sources evidencing the same assoc +
# exploded uniprot ids +
# I have to coalesce the score rows

In [25]:
df = df_s.copy()

## 5. Extract MoA

In [26]:
moas = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.compoundaction.csv",
        sep=";",
        header=0,
        usecols=['compound_id', 'target_id', 'actiontype_id'])
)

In [27]:
df = df.merge(
        moas,
        on=['target_id', 'compound_id'],
        how='left'
)

In [28]:
df.head()

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control,compound_name,inchiKey,basetarget_id,target_name,...,organism_id,uniprot_ids,uniprot_id,score_id,percentage,probesDrugsScore,probeMinerScore,scoreInCells,scoreInOrganisms,actiontype_id
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,2832.0,Q13164,Q13164,5.0,66.75,,,66.75,,inhibitor
1,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,2832.0,Q13164,Q13164,6.0,25.0,,,,25.0,inhibitor
2,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,2832.0,Q13164,Q13164,7.0,60.0,60.0,,,,inhibitor
3,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,2832.0,Q13164,Q13164,5.0,66.75,,,66.75,,
4,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,2832.0,Q13164,Q13164,6.0,25.0,,,,25.0,


## 6. Map to drugId

In [3]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = (SparkSession.builder
    .appName('spark')
    .getOrCreate())

In [30]:

drug_idx = (
    spark.read.parquet('/Users/irene/Documents/dev/pyspark/21.06.5/molecule')
    .select(col('id').alias('drugId'), 'inchiKey')
    .toPandas()
)

In [32]:
df = (
    # Get drugId
    df.merge(
        drug_idx,
        on='inchiKey',
        how='left')
)


df.head()

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control,compound_name,inchiKey,basetarget_id,target_name,...,uniprot_ids,uniprot_id,score_id,percentage,probesDrugsScore,probeMinerScore,scoreInCells,scoreInOrganisms,actiontype_id,drugId
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,Q13164,Q13164,5.0,66.75,,,66.75,,inhibitor,
1,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,Q13164,Q13164,6.0,25.0,,,,25.0,inhibitor,
2,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,Q13164,Q13164,7.0,60.0,60.0,,,,inhibitor,
3,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,Q13164,Q13164,5.0,66.75,,,66.75,,,
4,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,Q13164,Q13164,6.0,25.0,,,,25.0,,


In [33]:
df = df.astype({
    'probe_id': 'Int64',
    'basetarget_id': 'Int64',
    'target_id': 'Int64',
    'organism_id': 'Int64',
    'score_id': 'Int64'
})

In [34]:
df.to_csv('output/chemprobes_v1.tsv', sep='\t', header=True, index=False)

In [35]:
df.iloc[0]

compound_id                                       5105
source                             Chemical Probes.org
source_url              http://www.chemicalprobes.org/
probe_id                                         14769
origin                                    experimental
control                                            NaN
compound_name                                  XMD8-92
inchiKey                   QAPAJIZPZGWAND-UHFFFAOYSA-N
basetarget_id                                    15105
target_name         Mitogen-activated protein kinase 7
target_symbols                                   MAPK7
target_id                                        66118
organism_id                                       2832
uniprot_ids                                     Q13164
uniprot_id                                      Q13164
score_id                                             5
percentage                                       66.75
probesDrugsScore                                   NaN
probeMiner

## 7. Build final dataset

In [4]:
data = spark.read.csv('output/chemprobes_v1.tsv', sep='\t', header=True)

All posible URLs
```
+-------------------------------------------+---------------------------------------------------------------+
|source                                     |source_url                                                     |
+-------------------------------------------+---------------------------------------------------------------+
|Probe Miner (suitable probes)              |http://probeminer.icr.ac.uk/#/                                 |
|High-quality chemical probes               |null                                                           |
|Chemical Probes.org                        |http://www.chemicalprobes.org/                                 |
|opnMe Portal                               |https://opnme.com/                                             |
|Bromodomains chemical toolbox              |https://www.nature.com/articles/s41467-019-09672-2#article-info|
|SGC Probes                                 |http://www.thesgc.org/chemical-probes                          |
|Gray Laboratory Probes                     |http://graylab.dana-farber.org/probes.html#                    |
|Protein methyltransferases chemical toolbox|https://www.nature.com/articles/s41467-018-07905-4             |
|Nature Chemical Biology Probes             |http://www.nature.com/nchembio/chemical_probes.html            |
|Open Science Probes                        |http://www.sgc-ffm.uni-frankfurt.de/#!start `

```


In [5]:
data.first()

Row(compound_id='5105', source='Chemical Probes.org', source_url='http://www.chemicalprobes.org/', probe_id='14769', origin='experimental', control=None, compound_name='XMD8-92', inchiKey='QAPAJIZPZGWAND-UHFFFAOYSA-N', basetarget_id='15105', target_name='Mitogen-activated protein kinase 7', target_symbols='MAPK7', target_id='66118', organism_id='2832', uniprot_ids='Q13164', uniprot_id='Q13164', score_id='5', percentage='66.75', probesDrugsScore=None, probeMinerScore=None, scoreInCells='66.75', scoreInOrganisms=None, actiontype_id='inhibitor', drugId=None)

In [24]:
(data
.filter(col('compound_id') == '888')
).show(10, False, True)

-RECORD 0-----------------------------------------------------------------------------------------
 compound_id      | 888                                                                           
 source           | High-quality chemical probes                                                  
 source_url       | null                                                                          
 probe_id         | 18805                                                                         
 origin           | calculated                                                                    
 control          | null                                                                          
 compound_name    | TGX-221                                                                       
 inchiKey         | CPRAGQJXBLMUEL-UHFFFAOYSA-N                                                   
 basetarget_id    | 13701                                                                         
 target_na

In [35]:
data_m = (data
    # Drop columns that are not needed
    .drop('compound_id', 'basetarget_id', 'target_id', 'organism_id', 'score_id', 'percentage', 'uniprot_ids', 'probe_id')
    # Change Probe Miner source name
    .withColumn('source', when(col('source').contains('Probe Miner'), 'Probe Miner').otherwise(col('source')))
    # Build more complete URLs
    .withColumn(
    'source_url',
    when(col('source_url').contains('probeminer'), concat('source_url', 'uniprot_id'))
    .when(col('source_url').contains('probes.org'), concat(lit('https://new.chemicalprobes.org/?q='), 'compound_name'))
    .when(col('source_url').contains('thesgc'), concat('source_url', lit('/'), 'compound_name'))
    .when(col('source_url').contains('graylab'), lit(None))
    .when(col('source_url').contains('opnme'), concat(lit('https://opnme.com/search-results/'), 'compound_name'))
    .when(col('source_url').contains('frankfurt'),
          concat(
              element_at(split(col('source_url'), '#!start'), 1),
              lit('#!specificprobeoverview/'),
              'compound_name'))
    .otherwise(col('source_url')))
    # Create high-quality flag
    .withColumn('isHighQuality', when(col('source') == 'High-quality chemical probes', True).otherwise(False))
    # Clean rows where control = 1
    .withColumn('control', when(~col('control').contains(1), col('control')))
    .withColumn('control', when(~col('control').contains('2S'), col('control')))
    # Group data and collect sets
    .groupBy('uniprot_id', 'compound_name', 'drugId', 'inchiKey')
    .agg(
        collect_set(
            struct(
                col('source').alias('niceName'),
                col('source_url').alias('url'))
        ).alias('urls'),
        flatten(collect_set(array("control"))).alias('control'), # max size = 1
        collect_set("probesDrugsScore").alias('probesDrugsScore'),
        collect_set("probeMinerScore").alias('probeMinerScore'),
        collect_set("scoreInCells").alias('scoreInCells'),
        collect_set("scoreInOrganisms").alias('scoreInOrganisms'),
        flatten(collect_set(array("actiontype_id"))).alias('mechanismOfAction'),
        flatten(collect_set(array("isHighQuality"))).alias('isHighQuality'),
        collect_set("origin").alias("origin")
    )
    .withColumn('probesDrugsScore', array_max(col('probesDrugsScore')))
    .withColumn('probeMinerScore', array_max(col('probeMinerScore')))
    .withColumn('scoreInCells', array_max(col('scoreInCells')))
    .withColumn('scoreInOrganisms', array_max(col('scoreInOrganisms')))
    .withColumn('mechanismOfAction', expr('filter(mechanismOfAction, x -> x is not null)'))
    .withColumn('control', explode(col('control')))
    .withColumnRenamed('compound_name', 'id')
    .withColumnRenamed('uniprot_id', 'targetFromSourceId')
    .withColumn('urls', array_distinct(col('urls')))
    # Remove reference to High Quality Probes source as this is a subset curated by P&Ds
    # Problem: there are compounds only referenced with High Quality Probes 
    # Solution: remove it if there is already a reference, otherwise reference P&Ds 
    .withColumn('urls', array_except(col('urls'),
                              array(struct(lit('High-quality chemical probes').alias("niceName"), lit(None).cast("string").alias("url")))))
    .withColumn( 
        'urls',
        when(
            size(col('urls')) == 0,
            array(struct(
                lit('Probes & Drugs Portal').alias('niceName'),
                lit('https://www.probes-drugs.org').alias('url')))
        )
        .otherwise(col('urls'))
    )
    # Convert empty arrays into null
    .withColumn('mechanismOfAction', when(size(col('mechanismOfAction')) == 0, lit(None)).otherwise(col('mechanismOfAction')))
    .withColumn('origin', when(size(col('origin')) == 0, lit(None)).otherwise(col('origin')))
    # Transform isHighQuality to keep only True
    .withColumn('isHighQuality', when(array_contains(col('isHighQuality'), True), True).otherwise(False))
    .distinct()
)

data_m.count()

4155

In [37]:
data_m.withColumn('url', explode('urls')).filter(col('url.niceName').contains('Drugs')).first()

Row(targetFromSourceId='O95977', id='ML131', drugId=None, inchiKey='XECIGFKOOKPZGH-UHFFFAOYSA-N', urls=[Row(niceName='Probes & Drugs Portal', url='https://www.probes-drugs.org')], control=None, probesDrugsScore='90.0', probeMinerScore='21.0', scoreInCells=None, scoreInOrganisms=None, mechanismOfAction=None, isHighQuality=True, origin=['experimental'], url=Row(niceName='Probes & Drugs Portal', url='https://www.probes-drugs.org'))

In [76]:
data_m.show(10, False, True)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 targetFromSourceId | P08246                                                                                                                                                                            
 id                 | BAY-678                                                                                                                                                                           
 drugId             | null                                                                                                                                                                              
 inchiKey           | PGIVGIFOWOVINL-GOSISDBHSA-N                                                                                                                                                   

In [38]:
data_m.coalesce(1).write.json('output/chemicalprobes_v7')