In [14]:
import pandas as pd

## 1. Extract all compound IDs that belong to a Chemical Probes Set

Compound sets per data source:
```
Bromodomains chemical toolbox, 277
Chemical Probes.org, 15
Gray Laboratory Probes, 240
MLP probes, 14 --> Legacy compound set
Open Science Probes, 213
High-quality chemical probes, 395 --> Turn into a boolean label
Nature Chemical Biology Probes, 19 --> Legacy compound set
opnMe Portal, 219
Probe Miner (suitable probes), 227
Protein methyltransferases chemical toolbox, 278
SGC Probes, 28
Tool Compound Set, 19`
```


In [308]:
compound2compoundset = pd.read_csv("/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.compoundtocompoundset.csv", sep=";", header=0, usecols=['compound_id', 'compoundset_id'])
compoundsets = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.compoundset.csv",
        sep=";",
        header=0,
        usecols=['compoundsetid', 'name', 'source_url'])
    .rename(columns={'name' : 'source'})
)

In [309]:
probes_compoundsetids = [277, 15, 240, 213, 395, 219, 227, 278, 28, 19]

compound2compoundset = (
    compound2compoundset.merge(
        compoundsets[['compoundsetid', 'source', 'source_url']],
        left_on='compoundset_id',
        right_on='compoundsetid',
        how='inner'
    )
    # Select compounds coming from Probes data sources
    .query('compoundset_id in @probes_compoundsetids')
    .drop(['compoundsetid', 'compoundset_id'], axis=1)
)

print(compound2compoundset.shape[0])
compound2compoundset.head()

4479


Unnamed: 0,compound_id,source,source_url
29444,5105,Chemical Probes.org,http://www.chemicalprobes.org/
29445,14759,Chemical Probes.org,http://www.chemicalprobes.org/
29446,9448,Chemical Probes.org,http://www.chemicalprobes.org/
29447,6072,Chemical Probes.org,http://www.chemicalprobes.org/
29448,5011,Chemical Probes.org,http://www.chemicalprobes.org/


# 🧐 Question: If probe in different sources, do we duplicate or want the id to be unique? Unique ID

## 2.1 Pair compound IDs with probes IDs

In [310]:
probes = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.probe.csv",
        sep=";",
        header=0,
        usecols=['probeid', 'compound_id', 'control', 'origin_id'],
        dtype={'probeid':'int64', 'control':'category'})
    .rename(columns={'probeid':'probe_id', 'origin_id':'origin'})
)
probes.head(5)

Unnamed: 0,probe_id,compound_id,origin,control
0,19154,6003,calculated,
1,15124,6317,experimental,
2,15085,6337,experimental,
3,15098,7180,experimental,
4,19148,7499,calculated,['GSK990']


In [311]:
# Controls are read as strings
def str_to_list(X):
    if isinstance(X, str):
        arr = X.strip('[]').split(',')
        arr = [e.strip("''") for e in arr]
        return arr
    return None
    
    
probes['control'] = probes['control'].apply(lambda X: str_to_list(X))

probes = probes.explode('control')

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


In [312]:
probes.head()

Unnamed: 0,probe_id,compound_id,origin,control
0,19154,6003,calculated,
1,15124,6317,experimental,
2,15085,6337,experimental,
3,15098,7180,experimental,
4,19148,7499,calculated,GSK990


In [313]:
# There are 13 compounds not present in the probes dataset
# All coming from 'High-quality chemical probes'

df = compound2compoundset.merge(
    probes,
    on='compound_id',
    how='left'
)

df.head(5)

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,
1,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14792.0,experimental,
2,5105,Chemical Probes.org,http://www.chemicalprobes.org/,19200.0,calculated,
3,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14767.0,experimental,
4,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14793.0,experimental,


## 2.2 Extract compound name and inchikey

In [314]:
compounds = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.compound.csv",
        sep=";",
        header=0,
        usecols=['compoundid', 'name', 'inchikey'])
    .rename(columns={'compoundid':'compound_id', 'name':'compound_name', 'inchikey':'inchiKey'})
)

In [315]:
df = df.merge(
    compounds,
    on='compound_id',
    how='left'
)

df.head(5)

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control,compound_name,inchiKey
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N
1,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14792.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N
2,5105,Chemical Probes.org,http://www.chemicalprobes.org/,19200.0,calculated,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N
3,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14767.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N
4,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14793.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N


## 3. Extract target associated with probe

In [316]:
# First extract target_id
probes2targets = pd.read_csv("/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.probetobasetarget.csv", 
                             sep=";",
                             header=0,
                             usecols=['probe_id', 'basetarget_id'])
                             

df = df.merge(
    probes2targets,
    on='probe_id',
    how='left'
)

Count of probes without an associated target per datasource:
```
MLP Probes                                     114
High-quality chemical probes                    44
Nature Chemical Biology Probes                  39
Chemical Probes.org                             25
Protein methyltransferases chemical toolbox     20
SGC Probes                                      20
Open Science Probes                              3
Gray Laboratory Probes                           2
opnMe Portal                                     1
Probe Miner (suitable probes)                    1
```

In [317]:
# Then extract target name

base_targets = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.basetarget.csv",
        sep=";",
        header=0,
        usecols=['basetargetid', 'name', 'gene_name'])
    .rename(columns={'basetargetid':'basetarget_id', 'name':'target_name', 'gene_name':'target_symbols'})
)

In [318]:
df = df.merge(
    base_targets,
    on='basetarget_id',
    how='left'
)

df.head(10)

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control,compound_name,inchiKey,basetarget_id,target_name,target_symbols
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7
1,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14792.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7
2,5105,Chemical Probes.org,http://www.chemicalprobes.org/,19200.0,calculated,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7
3,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14767.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N,12775.0,Aurora kinase A,AURKA
4,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14767.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N,10920.0,Aurora kinase B,AURKB
5,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14767.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N,11391.0,Aurora kinase C,AURKC
6,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14793.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N,12775.0,Aurora kinase A,AURKA
7,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14793.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N,10920.0,Aurora kinase B,AURKB
8,14759,Chemical Probes.org,http://www.chemicalprobes.org/,14793.0,experimental,,Aurora inhibitor (compound 1),DFQAJLQXPSPNJE-UHFFFAOYSA-N,11391.0,Aurora kinase C,AURKC
9,9448,Chemical Probes.org,http://www.chemicalprobes.org/,14766.0,experimental,,XL019,ISOCDPQFIXDIMS-QHCPKHFHSA-N,13812.0,Tyrosine-protein kinase JAK2,JAK2


In [319]:
# Extract uniprot ID from target

basetarget2target = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.targettobasetarget.csv",
        sep=";",
        header=0,
        usecols=['basetarget_id', 'target_id']))

targets = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.target.csv",
        sep=";",
        header=0,
        usecols=['targetid', 'organism_id', 'uniprotid'])
    .rename(columns={'targetid': 'target_id', 'uniprotid':'uniprot_ids'})
)

In [320]:
df = (
    # Map from basetarget to target
    df.merge(
        basetarget2target,
        on='basetarget_id',
        how='left'
    )
    # Extract organism_id and uniprot_ids from target
    .merge(
        targets,
        on='target_id',
        how='left'
    )
    # Select human as organism (organism_id = [5428, 2832])
    .query('organism_id in [5428, 2832]')
)


In [321]:
# Split and explode the uniprot_ids columns

df['uniprot_id'] = df['uniprot_ids'].apply(lambda X: X.split(','))
df = df.explode('uniprot_id')

In [322]:
df.head()

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control,compound_name,inchiKey,basetarget_id,target_name,target_symbols,target_id,organism_id,uniprot_ids,uniprot_id
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,66118.0,2832.0,Q13164,Q13164
3,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,59987.0,2832.0,Q13164,Q13164
4,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,59988.0,2832.0,Q13164,Q13164
5,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,59989.0,2832.0,Q13164,Q13164
6,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,59990.0,2832.0,Q13164,Q13164


## 4. Extract scores

In [323]:
scores = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.compoundtargetscore.csv",
        sep=";",
        header=0,
        usecols=['compound_id', 'basetarget_id', 'percentage', 'score_id'])
    # Filter with only score_ids of interest
    .query('score_id in [1, 5, 6, 7]')
)

df_s = df.merge(
        scores,
        on=['basetarget_id', 'compound_id'],
        how='left'
)

In [324]:
df_s

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control,compound_name,inchiKey,basetarget_id,target_name,target_symbols,target_id,organism_id,uniprot_ids,uniprot_id,score_id,percentage
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,66118.0,2832.0,Q13164,Q13164,5.0,66.75
1,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,66118.0,2832.0,Q13164,Q13164,6.0,25.00
2,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,66118.0,2832.0,Q13164,Q13164,7.0,60.00
3,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,59987.0,2832.0,Q13164,Q13164,5.0,66.75
4,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,MAPK7,59987.0,2832.0,Q13164,Q13164,6.0,25.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71379,109419,High-quality chemical probes,,15266.0,experimental,BAY-173,BAY-3153,LZSPYTNDRPEZBK-OXYJHPMESA-N,14213.0,C-C chemokine receptor type 1,CCR1,45404.0,2832.0,P32246,P32246,7.0,70.00
71380,109421,High-quality chemical probes,,15393.0,experimental,BI-4420,BI-1046,MLOGCHDCTRINMU-UHFFFAOYSA-N,17345.0,Probable G-protein coupled receptor 142,GPR142,67796.0,2832.0,Q7Z601,Q7Z601,7.0,20.00
71381,109421,High-quality chemical probes,,15393.0,experimental,BI-4420,BI-1046,MLOGCHDCTRINMU-UHFFFAOYSA-N,17345.0,Probable G-protein coupled receptor 142,GPR142,54436.0,2832.0,Q7Z601,Q7Z601,7.0,20.00
71382,109421,High-quality chemical probes,,15393.0,experimental,BI-4420,BI-1046,MLOGCHDCTRINMU-UHFFFAOYSA-N,17345.0,Probable G-protein coupled receptor 142,GPR142,54437.0,2832.0,Q7Z601,Q7Z601,7.0,20.00


In [325]:
df_s["probesDrugsScore"] = df_s.apply(lambda X: X.percentage if X.score_id == 7 else None, axis=1)
df_s['probeMinerScore'] = df_s.apply(lambda X: X.percentage if X.score_id == 1 else None, axis=1)
df_s['scoreInCells'] = df_s.apply(lambda X: X.percentage if X.score_id == 5 else None, axis=1)
df_s['scoreInOrganisms'] = df_s.apply(lambda X: X.percentage if X.score_id == 6 else None, axis=1)

In [326]:
# Right now I have duplicated rows due to:
# several data sources evidencing the same assoc +
# exploded uniprot ids +
# I have to coalesce the score rows

In [327]:
df = df_s.copy()

## 5. Extract MoA

In [328]:
moas = (
    pd.read_csv(
        "/Users/irene/Documents/dev/random_notebooks/probes-drugs/dump/public.compoundaction.csv",
        sep=";",
        header=0,
        usecols=['compound_id', 'target_id', 'actiontype_id'])
)

In [329]:
df = df.merge(
        moas,
        on=['target_id', 'compound_id'],
        how='left'
)

In [330]:
df.head()

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control,compound_name,inchiKey,basetarget_id,target_name,...,organism_id,uniprot_ids,uniprot_id,score_id,percentage,probesDrugsScore,probeMinerScore,scoreInCells,scoreInOrganisms,actiontype_id
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,2832.0,Q13164,Q13164,5.0,66.75,,,66.75,,inhibitor
1,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,2832.0,Q13164,Q13164,6.0,25.0,,,,25.0,inhibitor
2,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,2832.0,Q13164,Q13164,7.0,60.0,60.0,,,,inhibitor
3,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,2832.0,Q13164,Q13164,5.0,66.75,,,66.75,,
4,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,2832.0,Q13164,Q13164,6.0,25.0,,,,25.0,


## 6. Map to targetId and drugId

In [331]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = (SparkSession.builder
    .appName('spark')
    .getOrCreate())

In [332]:
target_idx = (
    spark.read.parquet('/Users/irene/Documents/dev/pyspark/21.06.5/targets')
    .select(col('id').alias('targetId'), col('proteinAnnotations.id').alias('uniprot_id'))
    .toPandas()
)

drug_idx = (
    spark.read.parquet('/Users/irene/Documents/dev/pyspark/21.06.5/molecule')
    .select(col('id').alias('drugId'), 'inchiKey')
    .toPandas()
)

In [333]:
df = (
    # Get targetId
    df.merge(
        target_idx,
        on='uniprot_id',
        how='left')
    # Get drugId
    .merge(
        drug_idx,
        on='inchiKey',
        how='left')
)


df.head()

Unnamed: 0,compound_id,source,source_url,probe_id,origin,control,compound_name,inchiKey,basetarget_id,target_name,...,uniprot_id,score_id,percentage,probesDrugsScore,probeMinerScore,scoreInCells,scoreInOrganisms,actiontype_id,targetId,drugId
0,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,Q13164,5.0,66.75,,,66.75,,inhibitor,ENSG00000166484,
1,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,Q13164,6.0,25.0,,,,25.0,inhibitor,ENSG00000166484,
2,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,Q13164,7.0,60.0,60.0,,,,inhibitor,ENSG00000166484,
3,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,Q13164,5.0,66.75,,,66.75,,,ENSG00000166484,
4,5105,Chemical Probes.org,http://www.chemicalprobes.org/,14769.0,experimental,,XMD8-92,QAPAJIZPZGWAND-UHFFFAOYSA-N,15105.0,Mitogen-activated protein kinase 7,...,Q13164,6.0,25.0,,,,25.0,,ENSG00000166484,


In [334]:
df = df.astype({
    'probe_id': 'Int64',
    'basetarget_id': 'Int64',
    'target_id': 'Int64',
    'organism_id': 'Int64',
    'score_id': 'Int64'
})

In [335]:
df.to_csv('output/chemprobes_v1.tsv', sep='\t', header=True, index=False)

In [355]:
df.iloc[0]

compound_id                                       5105
source                             Chemical Probes.org
source_url              http://www.chemicalprobes.org/
probe_id                                         14769
origin                                    experimental
control                                            NaN
compound_name                                  XMD8-92
inchiKey                   QAPAJIZPZGWAND-UHFFFAOYSA-N
basetarget_id                                    15105
target_name         Mitogen-activated protein kinase 7
target_symbols                                   MAPK7
target_id                                        66118
organism_id                                       2832
uniprot_ids                                     Q13164
uniprot_id                                      Q13164
score_id                                             5
percentage                                       66.75
probesDrugsScore                                   NaN
probeMiner

## 7. Build final dataset

In [336]:
data = spark.read.csv('output/chemprobes_v1.tsv', sep='\t', header=True)

In [376]:
data_m = (data
    # Drop columns that are not needed
    .drop('compound_id', 'basetarget_id', 'target_id', 'organism_id', 'score_id', 'percentage', 'uniprot_ids')
    # Group data and collect sets
    .groupBy('probe_id', 'targetId')
    .agg(
        collect_set(
            struct(
                col('source'),
                col('source_url').alias('url'))
        ).alias('urls'),
        flatten(collect_set(array("origin"))).alias('origin'),
        flatten(collect_set(array("control"))).alias('control'),
        flatten(collect_set(array("probesDrugsScore"))).alias('probesDrugsScore'),
        flatten(collect_set(array("probeMinerScore"))).alias('probeMinerScore'),
        flatten(collect_set(array("scoreInCells"))).alias('scoreInCells'),
        flatten(collect_set(array("scoreInOrganisms"))).alias('scoreInOrganisms'),
        flatten(collect_set(array("actiontype_id"))).alias('mechanismOfAction'),
        flatten(collect_set(array("inchiKey"))).alias('inchiKey'),
        flatten(collect_set(array("drugId"))).alias('drugId'),
        flatten(collect_set(array("uniprot_id"))).alias('targetFromSourceId')
    )
    .withColumn('origin', explode(col('origin')))
    .withColumn('control', explode(col('control')))
    .withColumn('probesDrugsScore', col('probesDrugsScore').getItem(0))
    .withColumn('probeMinerScore', col('probeMinerScore').getItem(0))
    .withColumn('scoreInCells', col('scoreInCells').getItem(0))
    .withColumn('scoreInOrganisms', col('scoreInOrganisms').getItem(0))
    .withColumn('mechanismOfAction', explode(col('mechanismOfAction')))
    .withColumn('inchiKey', explode(col('inchiKey')))
    .withColumn('drugId', explode(col('drugId')))
    .withColumn('targetFromSourceId', explode(col('targetFromSourceId')))
    
    
)

In [378]:
data_m.show(1, False, True)

-RECORD 0-----------------------------------------------------------------------------------------------------------
 probe_id           | 14522                                                                                         
 targetId           | ENSG00000169925                                                                               
 urls               | [{Chemical Probes.org, http://www.chemicalprobes.org/}, {High-quality chemical probes, null}] 
 origin             | experimental                                                                                  
 control            | null                                                                                          
 probesDrugsScore   | 20.0                                                                                          
 probeMinerScore    | null                                                                                          
 scoreInCells       | 100.0                                     

In [380]:
data_m.coalesce(1).write.json('output/chemicalprobes_v1')