In [112]:
import pandas as pd
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import *
from pyspark.sql.types import *


spark = (SparkSession.builder
                .appName('spark')
                .getOrCreate())

In [122]:
df = pd.read_csv("experimental-toxicity-data-2020-03-30.tsv", sep="\t")
tox21_assays = pd.read_csv("Assay_Information_August_2020/assay_annotation.tsv", sep="\t")

In [44]:
df.head()

Unnamed: 0,ensembl_gene_id,tissue,cell_format,assay_format_type,cell_short_name,assay_description,data_source,data_source_reference_link
0,ENSG00000091831,,cell line,cell-based,T47D,"ACEA_ER is a cell-based, single-readout assay ...",Tox21,https://ntp.niehs.nih.gov/whatwestudy/tox21/ph...
1,ENSG00000141510,,cell line,cell-based,HepG2,"APR_HepG2_1hr is a cell-based, multiplexed-rea...",Tox21,https://ntp.niehs.nih.gov/whatwestudy/tox21/ph...
2,ENSG00000120738,,cell line,cell-based,HepG2,"ATG_CIS is a cell-based, multiplexed-readout a...",Tox21,https://ntp.niehs.nih.gov/whatwestudy/tox21/ph...
3,ENSG00000091831,,cell line,cell-based,HepG2,"ATG_CIS is a cell-based, multiplexed-readout a...",Tox21,https://ntp.niehs.nih.gov/whatwestudy/tox21/ph...
4,ENSG00000169856,,cell line,cell-based,HepG2,"ATG_CIS is a cell-based, multiplexed-readout a...",Tox21,https://ntp.niehs.nih.gov/whatwestudy/tox21/ph...


In [123]:
tox21_assays.head()

Unnamed: 0,aid,asid,assay_name,assay_desc,timepoint_hr,organism_id,organism,tissue,cell_format,cell_free_component_source,cell_short_name,cell_growth_mode,assay_footprint,assay_format_type,assay_format_type_sub,content_readout_type,dilution_solvent,dilution_solvent_percent_max
0,1,1,ACEA_ER,"ACEA_ER is a cell-based, single-readout assay ...",80.0,9606.0,human,breast,cell line,,T47D,adherent,microplate: 384-well plate,cell-based,cell-based format,single,DMSO,0.5
1,2,2,APR_HepG2_1hr,"APR_HepG2_1hr is a cell-based, multiplexed-rea...",1.0,9606.0,human,liver,cell line,,HepG2,adherent,microplate: 384-well plate,cell-based,cell-based format,multiplexed,DMSO,0.5
2,3,2,APR_HepG2_24hr,"APR_HepG2_24hr is a cell-based, multiplexed-re...",24.0,9606.0,human,liver,cell line,,HepG2,adherent,microplate: 384-well plate,cell-based,cell-based format,multiplexed,DMSO,0.5
3,4,2,APR_HepG2_72hr,"APR_HepG2_72hr is a cell-based, multiplexed-re...",72.0,9606.0,human,liver,cell line,,HepG2,adherent,microplate: 384-well plate,cell-based,cell-based format,multiplexed,DMSO,0.5
4,5,3,ATG_CIS,"ATG_CIS is a cell-based, multiplexed-readout a...",24.0,9606.0,human,liver,cell line,,HepG2,adherent,microplate: 24-well plate,cell-based,cell-based format,multiplexed,DMSO,1.0


## 1. Merge tox21 with our dataset to fetch assay_name

In [304]:
# Filtering out 29 bioassays coming from etox
df = df.merge(tox21_assays[["assay_desc", "assay_name", "aid"]], left_on="assay_description", right_on="assay_desc", how="inner")

In [125]:
df.iloc[0]

ensembl_gene_id                                                 ENSG00000091831
tissue                                                                      NaN
cell_format                                                           cell line
assay_format_type                                                    cell-based
cell_short_name                                                            T47D
assay_description             ACEA_ER is a cell-based, single-readout assay ...
data_source                                                               Tox21
data_source_reference_link    https://ntp.niehs.nih.gov/whatwestudy/tox21/ph...
assay_desc                    ACEA_ER is a cell-based, single-readout assay ...
assay_name                                                              ACEA_ER
Name: 0, dtype: object

## 2. Map to assay target

In [49]:
# Potentially the second element in the assay name looks to be the target symbol, let's see how true this is

df["potential_target"] = df["assay_name"].apply(lambda X: X.split("_")[1])

In [50]:
df.iloc[0]

ensembl_gene_id                                                 ENSG00000091831
tissue                                                                      NaN
cell_format                                                           cell line
assay_format_type                                                    cell-based
cell_short_name                                                            T47D
assay_description             ACEA_ER is a cell-based, single-readout assay ...
data_source                                                               Tox21
data_source_reference_link    https://ntp.niehs.nih.gov/whatwestudy/tox21/ph...
assay_desc                    ACEA_ER is a cell-based, single-readout assay ...
assay_name                                                              ACEA_ER
potential_target                                                             ER
Name: 0, dtype: object

## 2.1 Extract potential_target from the assay_name

In [54]:
genes = spark.read.json("../21.02_gene-data.json").select("id", "approved_symbol", "symbol_synonyms", "name_synonyms")
genes.show(1, False, True)

-RECORD 0---------------------------------------------------------------------
 id              | ENSG00000094914                                            
 approved_symbol | AAAS                                                       
 symbol_synonyms | [ADRACALA]                                                 
 name_synonyms   | [Adracalin, Allgrove, triple-A, adracalin, aladin, Aladin] 
only showing top 1 row



In [56]:
# Explode everything
genes = (genes
    .withColumn("symbol_synonyms", explode(col("symbol_synonyms")))
    .withColumn("name_synonyms", explode(col("name_synonyms")))
)

In [60]:
#genes = genes.toPandas()

df.shape

(452, 11)

In [84]:
# How good does it merge using the approved_symbol? 442 out of 452 are excluded. Only 1 out of 47 ("AR") matches

(df
 .merge(genes, left_on="potential_target", right_on="approved_symbol", how="left", indicator=True).query('_merge == "left_only"')
).head()

Unnamed: 0,ensembl_gene_id,tissue,cell_format,assay_format_type,cell_short_name,assay_description,data_source,data_source_reference_link,assay_desc,assay_name,potential_target,id,approved_symbol,symbol_synonyms,name_synonyms,_merge
0,ENSG00000091831,,cell line,cell-based,T47D,"ACEA_ER is a cell-based, single-readout assay ...",Tox21,https://ntp.niehs.nih.gov/whatwestudy/tox21/ph...,"ACEA_ER is a cell-based, single-readout assay ...",ACEA_ER,ER,,,,,left_only
1,ENSG00000141510,,cell line,cell-based,HepG2,"APR_HepG2_1hr is a cell-based, multiplexed-rea...",Tox21,https://ntp.niehs.nih.gov/whatwestudy/tox21/ph...,"APR_HepG2_1hr is a cell-based, multiplexed-rea...",APR_HepG2_1hr,HepG2,,,,,left_only
2,ENSG00000120738,,cell line,cell-based,HepG2,"ATG_CIS is a cell-based, multiplexed-readout a...",Tox21,https://ntp.niehs.nih.gov/whatwestudy/tox21/ph...,"ATG_CIS is a cell-based, multiplexed-readout a...",ATG_CIS,CIS,,,,,left_only
3,ENSG00000091831,,cell line,cell-based,HepG2,"ATG_CIS is a cell-based, multiplexed-readout a...",Tox21,https://ntp.niehs.nih.gov/whatwestudy/tox21/ph...,"ATG_CIS is a cell-based, multiplexed-readout a...",ATG_CIS,CIS,,,,,left_only
4,ENSG00000169856,,cell line,cell-based,HepG2,"ATG_CIS is a cell-based, multiplexed-readout a...",Tox21,https://ntp.niehs.nih.gov/whatwestudy/tox21/ph...,"ATG_CIS is a cell-based, multiplexed-readout a...",ATG_CIS,CIS,,,,,left_only


### "potential_target" from the assay_name is not a good approach

## 2.2 Extract target from the assay components annotation

### 2.2.1. Extract target symbol

In [301]:
tox21_components = pd.read_csv("Assay_Information_August_2020/assay_component_annotation.tsv", sep="\t")
tox21_components.head()

Unnamed: 0,acid,aid,assay_component_name,assay_component_desc,assay_component_target_desc,parameter_readout_type,assay_design_type,assay_design_type_sub,biological_process_target,detection_technology_type,detection_technology_type_sub,detection_technology,signal_direction_type,key_assay_reagent_type,key_assay_reagent,technological_target_type,technological_target_type_sub
0,1,1,ACEA_ER_80hr,"ACEA_ER_80hr, is one of two assay component(s)...",Electrical impedance is used to quantify chang...,single,growth reporter,real-time cell-growth kinetics,cell proliferation,Label Free Technology,Electrical Sensor: Impedance,RT-CES,both,,,cellular,cellular
1,1804,1,ACEA_ER_AUC_viability,"ACEA_ER_AUC_viability, is one of two assay com...",Electrical impedance is used to quantify chang...,single,growth reporter,real-time cell-growth kinetics,cell proliferation,Label Free Technology,Electrical Sensor: Impedance,RT-CES,both,,,cellular,cellular
2,2,2,APR_HepG2_CellCycleArrest_1hr,"APR_HepG2_CellCycleArrest_1hr, is one of 10 as...",Hoechst-33342 dye is used as a stain for DNA t...,single,morphology reporter,cell phenotype,cell cycle,Fluorescence,Fluorescence intensity,HCS Fluorescent Imaging,both,stain,Hoechst-33342 dye,dna,dna-unspecified
3,3,2,APR_HepG2_CellLoss_1hr,"APR_HepG2_CellLoss_1hr, is one of 10 assay com...",Hoechst-33342 dye is used as a stain for DNA t...,single,viability reporter,cell number,cell death,Fluorescence,Fluorescence intensity,HCS Fluorescent Imaging,both,stain,Hoechst-33342 dye,dna,dna-unspecified
4,4,2,APR_HepG2_MicrotubuleCSK_1hr,"APR_HepG2_MicrotubuleCSK_1hr, is one of 10 ass...",anti-a-tubulin antibody is used to tag and qua...,single,conformation reporter,protein conformation,protein stabilization,Fluorescence,Fluorescence intensity,HCS Fluorescent Imaging,both,antibody,anti-a-tubulin antibody,protein,protein-specified


In [280]:
# Some of the components have the annotated target under assay_component_target_desc

tox21_components["assay_component_target_desc"][4]

'anti-a-tubulin antibody is used to tag and quantify the level of tubulin, alpha 1a protein. Changes in the signals are indicative of protein expression changes as a cellular response to stress in the system [GeneSymbol:TUBA1A | GeneID:7846 | Uniprot_SwissProt_Accession:Q71U36].'

In [281]:
# Do we have targets for all assay ids (aid)?

tox21_targets = tox21_components[tox21_components["assay_component_target_desc"].str.contains("GeneSymbol", na=False)]

tox21_targets

Unnamed: 0,acid,aid,assay_component_name,assay_component_desc,assay_component_target_desc,parameter_readout_type,assay_design_type,assay_design_type_sub,biological_process_target,detection_technology_type,detection_technology_type_sub,detection_technology,signal_direction_type,key_assay_reagent_type,key_assay_reagent,technological_target_type,technological_target_type_sub
4,4,2,APR_HepG2_MicrotubuleCSK_1hr,"APR_HepG2_MicrotubuleCSK_1hr, is one of 10 ass...",anti-a-tubulin antibody is used to tag and qua...,single,conformation reporter,protein conformation,protein stabilization,Fluorescence,Fluorescence intensity,HCS Fluorescent Imaging,both,antibody,anti-a-tubulin antibody,protein,protein-specified
7,7,2,APR_HepG2_MitoticArrest_1hr,"APR_HepG2_MitoticArrest_1hr, is one of 10 assa...",anti-phospho-histone-H3 antibody is used to ta...,single,morphology reporter,cell phenotype,cell cycle,Fluorescence,Fluorescence intensity,HCS Fluorescent Imaging,both,antibody,anti-phospho-histone-H3 antibody,protein,protein-specified
9,9,2,APR_HepG2_OxidativeStress_1hr,"APR_HepG2_OxidativeStress_1hr, is one of 10 as...",anti-phospho-histone-H2AX antibody is used to ...,single,viability reporter,dna content,oxidative phosphorylation,Fluorescence,Fluorescence intensity,HCS Fluorescent Imaging,both,antibody,anti-phospho-histone-H2AX antibody,protein,protein-specified
10,10,2,APR_HepG2_p53Act_1hr,"APR_HepG2_p53Act_1hr, is one of 10 assay compo...",anti-p53 antibody is used to tag and quantify ...,single,viability reporter,dna content,cell death,Fluorescence,Fluorescence intensity,HCS Fluorescent Imaging,both,antibody,anti-p53 antibody,protein,protein-specified
11,11,2,APR_HepG2_StressKinase_1hr,"APR_HepG2_StressKinase_1hr, is one of 10 assay...",anti-phospho-c-jun antibody is used to tag and...,single,enzyme reporter,enzyme activity,oxidative phosphorylation,Fluorescence,Fluorescence intensity,HCS Fluorescent Imaging,both,antibody,anti-phospho-c-jun antibody,protein,protein-specified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1029,2329,664,NVS_ENZ_hPDE9A2,NVS_ENZ_hPDE9A is one of one assay component(s...,Changes to fluorescence intensity signals prod...,single,enzyme reporter,enzyme activity,regulation of catalytic activity,Fluorescence,Fluorescence intensity,Fluorescence and electrophoretic mobility shift,loss,substrate,Fluorescent cyclic AMP,protein,enzyme
1061,2393,681,UPITT_HCI_U2OS_AR_TIF2_Nucleoli_Antagonist,UPITT_HCI_U2OS_AR_TIF2_Nucleoli_Antagonist is ...,Loss of DHT-induced red fluorescence in nucleo...,single,binding reporter,protein-protein binding,receptor binding,Fluorescence,Optical microscopy: Fluorescence microscopy,HCS Fluorescent Imaging,loss,,,protein,receptor binding
1062,2394,681,UPITT_HCI_U2OS_AR_TIF2_Nucleoli_Agonist,UPITT_HCI_U2OS_AR_TIF2_Nucleoli_Agonist is one...,Increased red fluorescence in nucleolar region...,single,binding reporter,protein-protein binding,receptor binding,Fluorescence,Optical microscopy: Fluorescence microscopy,HCS Fluorescent Imaging,gain,,,protein,receptor binding
1063,2397,681,UPITT_HCI_U2OS_AR_TIF2_Nucleoli_Cytoplasm_Rati...,UPITT_HCI_U2OS_AR_TIF2_Nucleoli_Cytoplasm_Rati...,Decrease of the ratio of DHT-induced red fluor...,single,binding reporter,protein-protein binding,receptor binding,Fluorescence,Optical microscopy: Fluorescence microscopy,HCS Fluorescent Imaging,loss,,,protein,receptor binding


In [282]:
# extracting the symbols

_tox21_targets = tox21_targets["assay_component_target_desc"].str.extract('(GeneSymbol: ([^\s]+))|(GeneSymbol:([^\s]+))', expand=False)

_tox21_targets

Unnamed: 0,0,1,2,3
4,,,GeneSymbol:TUBA1A,TUBA1A
7,,,GeneSymbol:H3F3A,H3F3A
9,,,GeneSymbol:H2AFX,H2AFX
10,,,GeneSymbol:TP53,TP53
11,,,GeneSymbol:JUN,JUN
...,...,...,...,...
1029,,,GeneSymbol:PDE4D,PDE4D
1061,GeneSymbol: AR,AR,,
1062,GeneSymbol: AR,AR,,
1063,GeneSymbol: AR,AR,,


In [283]:
_tox21_targets["assay_target_symbol"] = _tox21_targets[3].combine_first(_tox21_targets[1]).apply(lambda X: X.strip("]"))
_tox21_targets.head()

Unnamed: 0,0,1,2,3,assay_target_symbol
4,,,GeneSymbol:TUBA1A,TUBA1A,TUBA1A
7,,,GeneSymbol:H3F3A,H3F3A,H3F3A
9,,,GeneSymbol:H2AFX,H2AFX,H2AFX
10,,,GeneSymbol:TP53,TP53,TP53
11,,,GeneSymbol:JUN,JUN,JUN


In [284]:
tox21_targets = pd.concat([tox21_targets, _tox21_targets["assay_target_symbol"]], axis=1)

tox21_targets["assay_target_symbol"].value_counts()

ESR1     14
AR       13
NR1I3     6
CCL2      6
RARA      6
         ..
PDE5A     1
ADRB2     1
OPRM1     1
IL6       1
MAPK9     1
Name: assay_target_symbol, Length: 348, dtype: int64

### 2.2.2. Extract Ensembl ID

In [289]:
genes["approved_symbol"] = genes["approved_symbol"].apply(lambda X: X.lower())
genes["symbol_synonyms"] = genes["symbol_synonyms"].apply(lambda X: X.lower())
tox21_targets["assay_target_symbol"] = tox21_targets["assay_target_symbol"].apply(lambda X: X.lower())

In [286]:
tox21_targets = tox21_targets \
                    .merge(
                        genes[["approved_symbol", "id"]].drop_duplicates(),
                        how="left",
                        left_on="assay_target_symbol",
                        right_on="approved_symbol",
                        indicator=True)
tox21_targets.head()

Unnamed: 0,acid,aid,assay_component_name,assay_component_desc,assay_component_target_desc,parameter_readout_type,assay_design_type,assay_design_type_sub,biological_process_target,detection_technology_type,...,detection_technology,signal_direction_type,key_assay_reagent_type,key_assay_reagent,technological_target_type,technological_target_type_sub,assay_target_symbol,approved_symbol,id,_merge
0,4,2,APR_HepG2_MicrotubuleCSK_1hr,"APR_HepG2_MicrotubuleCSK_1hr, is one of 10 ass...",anti-a-tubulin antibody is used to tag and qua...,single,conformation reporter,protein conformation,protein stabilization,Fluorescence,...,HCS Fluorescent Imaging,both,antibody,anti-a-tubulin antibody,protein,protein-specified,tuba1a,tuba1a,ENSG00000167552,both
1,7,2,APR_HepG2_MitoticArrest_1hr,"APR_HepG2_MitoticArrest_1hr, is one of 10 assa...",anti-phospho-histone-H3 antibody is used to ta...,single,morphology reporter,cell phenotype,cell cycle,Fluorescence,...,HCS Fluorescent Imaging,both,antibody,anti-phospho-histone-H3 antibody,protein,protein-specified,h3f3a,,,left_only
2,9,2,APR_HepG2_OxidativeStress_1hr,"APR_HepG2_OxidativeStress_1hr, is one of 10 as...",anti-phospho-histone-H2AX antibody is used to ...,single,viability reporter,dna content,oxidative phosphorylation,Fluorescence,...,HCS Fluorescent Imaging,both,antibody,anti-phospho-histone-H2AX antibody,protein,protein-specified,h2afx,,,left_only
3,10,2,APR_HepG2_p53Act_1hr,"APR_HepG2_p53Act_1hr, is one of 10 assay compo...",anti-p53 antibody is used to tag and quantify ...,single,viability reporter,dna content,cell death,Fluorescence,...,HCS Fluorescent Imaging,both,antibody,anti-p53 antibody,protein,protein-specified,tp53,tp53,ENSG00000141510,both
4,11,2,APR_HepG2_StressKinase_1hr,"APR_HepG2_StressKinase_1hr, is one of 10 assay...",anti-phospho-c-jun antibody is used to tag and...,single,enzyme reporter,enzyme activity,oxidative phosphorylation,Fluorescence,...,HCS Fluorescent Imaging,both,antibody,anti-phospho-c-jun antibody,protein,protein-specified,jun,jun,ENSG00000177606,both


In [287]:
tox21_targets._merge.value_counts()

both          421
left_only      66
right_only      0
Name: _merge, dtype: int64

In [290]:
tox21_targets[tox21_targets["_merge"] == "left_only"]["assay_target_symbol"].unique()

array(['h3f3a', 'h2afx', 'mtf1', 'sox1', 'plat', 'col3a1', 'cd38',
       'cyp2j2', 'cyp4f12', 'cyp2a1', 'cyp2a2', 'cyp2b1', 'cyp2c11',
       'cyp2c12', 'cyp2c13', 'cyp2c6v1', 'cyp2d1', 'cyp2d2',
       'cyp3a23/3a1', 'cyp3a2', 'ache', 'gsk3b', 'lck', 'pde10a',
       'prkag1', 'syk', 'mthfr', 'comt', 'maoa', 'maob', 'drd1', 'hrh1',
       'hrh2', 'chrm3', 'drd2', 'drd4', 'chrm1', 'chrm2', 'chrm4',
       'chrm5', 'npy2r', 'tbxa2r', 'adra1b', 'oxtr', 'sstr1', 'trhr',
       'gabra5', 'chrna2', 'gabra6', 'glra1', 'nr1f3', 'nr3c4'],
      dtype=object)

In [293]:
# Weird: some targets preset in the index are not in the dataset

genes.query('id == "ENSG00000196639"')

Unnamed: 0,id,approved_symbol,symbol_synonyms,name_synonyms


In [296]:
'''Merge also with synonyms

tox21_targets_w_syn = tox21_targets[tox21_targets["id"].isna()] \
                    .merge(
                        genes[["symbol_synonyms", "id"]].drop_duplicates(),
                        how="left",
                        left_on="assay_target_symbol",
                        right_on="symbol_synonyms") \
                    .dropna(subset=["symbol_synonyms"])

tox21_targets_w_syn
'''
;

''

In [299]:
tox21_targets = tox21_targets[["acid", "aid", "assay_component_name", "assay_component_desc", "assay_component_target_desc", "biological_process_target", "approved_symbol", "id"]]

In [300]:
# I'll only work with those who have an ENSID
tox21_targets.dropna(inplace=True, subset=["id"])
tox21_targets = tox21_targets[["acid", "aid", "assay_component_name", "assay_component_desc", "assay_component_target_desc", "biological_process_target", "approved_symbol", "id"]]
tox21_targets.head()

Unnamed: 0,acid,aid,assay_component_name,assay_component_desc,assay_component_target_desc,biological_process_target,approved_symbol,id
0,4,2,APR_HepG2_MicrotubuleCSK_1hr,"APR_HepG2_MicrotubuleCSK_1hr, is one of 10 ass...",anti-a-tubulin antibody is used to tag and qua...,protein stabilization,tuba1a,ENSG00000167552
3,10,2,APR_HepG2_p53Act_1hr,"APR_HepG2_p53Act_1hr, is one of 10 assay compo...",anti-p53 antibody is used to tag and quantify ...,cell death,tp53,ENSG00000141510
4,11,2,APR_HepG2_StressKinase_1hr,"APR_HepG2_StressKinase_1hr, is one of 10 assay...",anti-phospho-c-jun antibody is used to tag and...,oxidative phosphorylation,jun,ENSG00000177606
5,14,3,APR_HepG2_MicrotubuleCSK_24hr,"APR_HepG2_MicrotubuleCSK_24hr, is one of 10 as...",anti-a-tubulin antibody is used to tag and qua...,protein stabilization,tuba1a,ENSG00000167552
8,20,3,APR_HepG2_p53Act_24hr,"APR_HepG2_p53Act_24hr, is one of 10 assay comp...",anti-p53 antibody is used to tag and quantify ...,cell death,tp53,ENSG00000141510


### 2.2.3. Do we have targets for all assay ids (aid)?

In [302]:
# Doesn't look bad! assay #1 is ER
tox21_targets.aid.unique()

array([  2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
        15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  28,
        29,  31,  32,  42,  45,  47,  48,  49,  50,  51,  52,  53,  54,
        55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,
        68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  81,
        82,  83,  84,  85,  86,  87,  88,  90,  91,  92,  93,  94,  95,
        96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
       110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
       123, 124, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136,
       137, 138, 139, 141, 142, 143, 144, 145, 146, 147, 148, 150, 151,
       153, 155, 160, 161, 164, 165, 166, 167, 169, 170, 172, 173, 174,
       175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 189, 190,
       192, 198, 199, 201, 202, 203, 204, 205, 207, 208, 209, 210, 211,
       212, 213, 214, 215, 217, 218, 219, 220, 221, 222, 223, 22

In [307]:
# We don't have the annotation for all assay ids... any manual curation?

df.aid.unique()

array([  1,   2,   5,   6,   7,   8,  10, 106,  29, 110,  72, 112,  13,
        75,  80,  45,  48,  88, 128, 135,  15,  16, 137, 103,  24,  64,
        66,  27,  67, 312, 313, 379, 400, 249, 399, 173, 174, 177, 180,
       264, 265, 182, 370, 284, 298, 404, 442, 408, 574, 401,   9,  11,
       107,  12,  30, 111,  76,  79,  46, 117,  14,  89,  90,  53,  93,
       120,  60,  99, 122, 123, 310, 237, 263, 271, 273, 280, 193, 198,
       282, 283, 200, 203, 292, 293, 443, 405, 534, 582, 430, 109, 115,
       140, 141, 124,  82, 133, 118,  59,  61,  22,  23, 102,  68, 311,
       272, 191, 302, 306, 402, 579, 114, 116,  91,  52,  92, 104, 376,
       317, 320, 268, 181, 184, 270, 274, 328, 189, 202, 207, 297, 307,
       403, 441, 578, 583,  71,  73, 113, 126, 131, 132,  85,  49, 136,
        96,  18,  57,  21, 101,  26, 146, 309, 375, 319, 321, 266, 325,
       185, 186, 269, 188, 192, 285, 294, 372,   3,  81,  86,  47,  50,
        54,  55, 130,  17,  97,  62,  28, 377, 155, 175, 178, 27

In [312]:
# Let's compare the target annotation

tox21_targets = tox21_targets.merge(df[["aid", "ensembl_gene_id"]], on="aid", how="left", indicator=True)

Unnamed: 0,acid,aid,assay_component_name,assay_component_desc,assay_component_target_desc,biological_process_target,approved_symbol,id,ensembl_gene_id,_merge
0,4,2,APR_HepG2_MicrotubuleCSK_1hr,"APR_HepG2_MicrotubuleCSK_1hr, is one of 10 ass...",anti-a-tubulin antibody is used to tag and qua...,protein stabilization,tuba1a,ENSG00000167552,ENSG00000141510,both
1,10,2,APR_HepG2_p53Act_1hr,"APR_HepG2_p53Act_1hr, is one of 10 assay compo...",anti-p53 antibody is used to tag and quantify ...,cell death,tp53,ENSG00000141510,ENSG00000141510,both
2,11,2,APR_HepG2_StressKinase_1hr,"APR_HepG2_StressKinase_1hr, is one of 10 assay...",anti-phospho-c-jun antibody is used to tag and...,oxidative phosphorylation,jun,ENSG00000177606,ENSG00000141510,both
3,14,3,APR_HepG2_MicrotubuleCSK_24hr,"APR_HepG2_MicrotubuleCSK_24hr, is one of 10 as...",anti-a-tubulin antibody is used to tag and qua...,protein stabilization,tuba1a,ENSG00000167552,ENSG00000141510,both
4,20,3,APR_HepG2_p53Act_24hr,"APR_HepG2_p53Act_24hr, is one of 10 assay comp...",anti-p53 antibody is used to tag and quantify ...,cell death,tp53,ENSG00000141510,ENSG00000141510,both
...,...,...,...,...,...,...,...,...,...,...
3123,2329,664,NVS_ENZ_hPDE9A2,NVS_ENZ_hPDE9A is one of one assay component(s...,Changes to fluorescence intensity signals prod...,regulation of catalytic activity,pde4d,ENSG00000113448,,left_only
3124,2393,681,UPITT_HCI_U2OS_AR_TIF2_Nucleoli_Antagonist,UPITT_HCI_U2OS_AR_TIF2_Nucleoli_Antagonist is ...,Loss of DHT-induced red fluorescence in nucleo...,receptor binding,ar,ENSG00000169083,,left_only
3125,2394,681,UPITT_HCI_U2OS_AR_TIF2_Nucleoli_Agonist,UPITT_HCI_U2OS_AR_TIF2_Nucleoli_Agonist is one...,Increased red fluorescence in nucleolar region...,receptor binding,ar,ENSG00000169083,,left_only
3126,2397,681,UPITT_HCI_U2OS_AR_TIF2_Nucleoli_Cytoplasm_Rati...,UPITT_HCI_U2OS_AR_TIF2_Nucleoli_Cytoplasm_Rati...,Decrease of the ratio of DHT-induced red fluor...,receptor binding,ar,ENSG00000169083,,left_only


In [317]:
# Target annotation for aid = 2

with pd.option_context('display.max_colwidth', None):
    print(tox21_targets.query('_merge == "both"')["assay_component_target_desc"][:3])

0                     anti-a-tubulin antibody is used to tag and quantify the level of tubulin, alpha 1a protein. Changes in the signals are indicative of protein expression changes as a cellular response to stress in the system [GeneSymbol:TUBA1A | GeneID:7846 | Uniprot_SwissProt_Accession:Q71U36].
1                             anti-p53 antibody is used to tag and quantify the level of tumor protein p53 protein. Changes in the signals are indicative of protein expression changes as a cellular response to stress in the system [GeneSymbol:TP53 | GeneID:7157 | Uniprot_SwissProt_Accession:P04637].
2    anti-phospho-c-jun antibody is used to tag and quantify the level of phosphorylated jun proto-oncogene protein. Changes in the signals are indicative of protein expression changes as a cellular response to stress in the system [GeneSymbol:JUN | GeneID:3725 | Uniprot_SwissProt_Accession:P05412].
Name: assay_component_target_desc, dtype: object


### The extracted target seems to be accurate but I have more annotation than in OT's experimental toxicity data