# ToxCast exploratory analysis

In [1]:
import pandas as pd

In [2]:
# Load the ToxCast table; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv("../data/external/toxcast.csv")

In [3]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(8597, 618)

In [4]:
# Preview the first rows to see the column layout and typical values
df.head()

Unnamed: 0,smiles,ACEA_T47D_80hr_Negative,ACEA_T47D_80hr_Positive,APR_HepG2_CellCycleArrest_24h_dn,APR_HepG2_CellCycleArrest_24h_up,APR_HepG2_CellCycleArrest_72h_dn,APR_HepG2_CellLoss_24h_dn,APR_HepG2_CellLoss_72h_dn,APR_HepG2_MicrotubuleCSK_24h_dn,APR_HepG2_MicrotubuleCSK_24h_up,...,Tanguay_ZF_120hpf_OTIC_up,Tanguay_ZF_120hpf_PE_up,Tanguay_ZF_120hpf_PFIN_up,Tanguay_ZF_120hpf_PIG_up,Tanguay_ZF_120hpf_SNOU_up,Tanguay_ZF_120hpf_SOMI_up,Tanguay_ZF_120hpf_SWIM_up,Tanguay_ZF_120hpf_TRUN_up,Tanguay_ZF_120hpf_TR_up,Tanguay_ZF_120hpf_YSE_up
0,[O-][N+](=O)C1=CC=C(Cl)C=C1,0.0,0.0,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C[SiH](C)O[Si](C)(C)O[Si](C)(C)O[SiH](C)C,,,,,,,,,,...,,,,,,,,,,
2,CN1CCN(CC1)C(=O)C1CCCCC1,,,,,,,,,,...,,,,,,,,,,
3,NC1=CC=C(C=C1)[N+]([O-])=O,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,OC1=CC=C(C=C1)[N+]([O-])=O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Missingness

In [5]:
# Per-variable missingness: the share of NaN entries in each column.
# NOTE: despite the name, `row_missing` is indexed by column, and it covers
# every column of `df` (the `smiles` column as well as the assay columns).
row_missing = df.isna().mean()

# Summarise the spread of those fractions with the median and the IQR
Q1, Q3 = (row_missing.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
median = row_missing.median()
print(f'Median: {median}', f'Interquartile range: {IQR}', sep='\n')

Median: 0.8319181109689426
Interquartile range: 0.36256833779225317


In [6]:
# Per-observation missingness: the share of NaN entries in each row.
# NOTE: despite the name, `col_missing` is indexed by row; the denominator
# counts every column of `df`, including `smiles`.
col_missing = df.isna().mean(axis=1)

# Summarise the spread of those fractions with the median and the IQR
Q1, Q3 = (col_missing.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
median = col_missing.median()
print(f'Median: {median}', f'Interquartile range: {IQR}', sep='\n')

Median: 0.8365695792880259
Interquartile range: 0.24757281553398058


## Variable values

In [8]:
# Count the distinct non-null values in every column except the SMILES string
unique_values = df.drop(columns='smiles').nunique()

print(unique_values)

ACEA_T47D_80hr_Negative             2
ACEA_T47D_80hr_Positive             2
APR_HepG2_CellCycleArrest_24h_dn    2
APR_HepG2_CellCycleArrest_24h_up    2
APR_HepG2_CellCycleArrest_72h_dn    2
                                   ..
Tanguay_ZF_120hpf_SOMI_up           2
Tanguay_ZF_120hpf_SWIM_up           2
Tanguay_ZF_120hpf_TRUN_up           2
Tanguay_ZF_120hpf_TR_up             2
Tanguay_ZF_120hpf_YSE_up            2
Length: 617, dtype: int64


In [10]:
# Collapse the per-variable counts to their distinct values. A single entry
# means every variable has that same number of distinct non-null values.
# Note this shows how many values each variable takes, not which values.
print(unique_values.unique())

[2]


Every variable takes exactly two distinct non-missing values, consistent with a binary outcome (0 = non-toxic, 1 = toxic).