In [1]:
# Notebook: 01_build_standardized_tox21_results.ipynb

import os
import zipfile
import pandas as pd
import numpy as np
from tqdm import tqdm
from rdkit import RDLogger

# silence RDKit chatter (standardization uses RDKit under the hood)
RDLogger.DisableLog('rdApp.*')

# 👉 paths (edit as needed)
download_dir = "Downloaded Files"   # folder containing the original Tox21 zip files
output_excel = os.path.join("data", "StandardizedTox21Results.xlsx")

# make sure output dir exists
os.makedirs(os.path.dirname(output_excel), exist_ok=True)

# allowed assay shortnames to keep (lowercase)
allowed_assays = [
    'ar-bla-agonist-p1','ar-bla-antagonist-p1','car-agonist-p1','car-antagonist-p1',
    'er-bla-agonist-p2','er-bla-antagonist-p1','erb-bla-antagonist-p1','erb-bla-p1',
    'fxr-bla-agonist-p2','fxr-bla-antagonist-p1','gr-hela-bla-agonist-p1','gr-hela-bla-antagonist-p1',
    'ppard-bla-agonist-p1','ppard-bla-antagonist-p1','pparg-bla-agonist-p1','pparg-bla-antagonist-p1',
    'pr-bla-agonist-p1','pr-bla-antagonist-p1','pxr-p1','rar-agonist-p1',
    'rar-antagonist-p2','ror-cho-antagonist-p1','rxr-bla-agonist-p1',
    'vdr-bla-agonist-p1','vdr-bla-antagonist-p1'
]
allowed_assays_lower = [a.lower() for a in allowed_assays]

In [2]:
def drop_trailing_unnamed(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the last column if it's Unnamed or entirely NaN."""
    if df.shape[1] == 0:
        return df
    last = df.columns[-1]
    if str(last).startswith("Unnamed") or df.iloc[:, -1].isna().all():
        return df.iloc[:, :-1]
    return df

def unanimous_vote(series: pd.Series) -> str:
    """
    Ignore null/NaN and 'inconclusive' (case-insensitive).
    If remaining values are all the same → return that value, else 'Inconclusive'.
    """
    valid = series.dropna().astype(str)
    valid = valid[~valid.str.lower().isin(["inconclusive", ""])]
    if valid.empty:
        return "Inconclusive"
    uniq = valid.unique()
    return uniq[0] if len(uniq) == 1 else "Inconclusive"

def aggregate_sample_group(group: pd.DataFrame) -> pd.Series:
    """
    Aggregate rows that share (SAMPLE_ID, SAMPLE_NAME):
    - 'result' via unanimous_vote
    - assert SMILES consistency (non-null values must match)
    """
    agg_result = unanimous_vote(group["result"])
    smiles_series = group["SMILES"].dropna()
    if not smiles_series.empty and smiles_series.nunique() > 1:
        raise AssertionError(
            f"Inconsistent SMILES for SAMPLE_ID {group['SAMPLE_ID'].iloc[0]}, "
            f"SAMPLE_NAME {group['SAMPLE_NAME'].iloc[0]}: {smiles_series.unique()}"
        )
    agg_smiles = smiles_series.iloc[0] if not smiles_series.empty else np.nan
    return pd.Series({"result": agg_result, "SMILES": agg_smiles})

def assay_name_from_zip(filename: str) -> str:
    """
    Derive an assay shortname column from a tox21 zip filename.
    Example: 'tox21-pparg-bla-agonist-p1_...zip' → 'pparg-bla-agonist-p1'
    """
    base = os.path.splitext(filename)[0]
    base = base.replace("tox21-", "").strip()
    # cut at first underscore if present
    return base.split("_")[0]

def read_aggregated_from_zip(zip_path: str) -> pd.DataFrame | None:
    """
    Read 'aggregated.txt' (or common misspelling 'aggregrated.txt') from a tox21 zip,
    return an activity-only DataFrame with unified 'result' values.
    """
    with zipfile.ZipFile(zip_path, "r") as zf:
        agg_member = None
        for name in zf.namelist():
            low = name.lower()
            if "aggregated.txt" in low or "aggregrated.txt" in low:
                agg_member = name
                break
        if agg_member is None:
            print(f"[skip] No aggregated file inside: {os.path.basename(zip_path)}")
            return None

        with zf.open(agg_member) as f:
            df = pd.read_csv(f, sep="\t", header=0, index_col=False)
            df = drop_trailing_unnamed(df)

    # activity rows only
    if "SAMPLE_DATA_TYPE" not in df.columns:
        print(f"[skip] Missing SAMPLE_DATA_TYPE in {os.path.basename(zip_path)}")
        return None
    df = df[df["SAMPLE_DATA_TYPE"] == "activity"].copy()
    if df.empty:
        print(f"[skip] No activity rows in {os.path.basename(zip_path)}")
        return None

    # sanity checks
    if df["SAMPLE_NAME"].isnull().any():
        raise ValueError(f"Null SAMPLE_NAME in {os.path.basename(zip_path)}")
    for sample_id, g in df.groupby("SAMPLE_ID"):
        if g["SAMPLE_NAME"].nunique() > 1:
            raise ValueError(
                f"Inconsistent SAMPLE_NAME for SAMPLE_ID {sample_id} in {os.path.basename(zip_path)}"
            )

    # map per-row result
    conditions = [
        (df["ASSAY_OUTCOME"] == "inactive") & (df["REPRODUCIBILITY"] == "inactive_match"),
        (df["ASSAY_OUTCOME"] == "active agonist") & (df["REPRODUCIBILITY"] == "active_match"),
        (df["ASSAY_OUTCOME"] == "active antagonist") & (df["REPRODUCIBILITY"] == "active_match"),
    ]
    choices = ["Inactive", "Agonist", "Antagonist"]
    df["result"] = np.select(conditions, choices, default="Inconclusive")

    # group to one row per (SAMPLE_ID, SAMPLE_NAME)
    collapsed = df.groupby(["SAMPLE_ID", "SAMPLE_NAME"], as_index=False).apply(aggregate_sample_group)
    return collapsed.reset_index(drop=True)

In [3]:
# collect and merge in-memory
zip_files = [f for f in os.listdir(download_dir) if f.lower().endswith(".zip")]
if not zip_files:
    raise FileNotFoundError(f"No .zip files found in '{download_dir}'")

master_df = None

for fname in tqdm(zip_files, desc="Reading Tox21 zip files"):
    zpath = os.path.join(download_dir, fname)
    df_assay = read_aggregated_from_zip(zpath)
    if df_assay is None or df_assay.empty:
        continue

    # rename 'result' -> assay column
    assay_col = assay_name_from_zip(fname)
    df_assay = df_assay.rename(columns={"result": assay_col})

    # merge into master on sample keys
    if master_df is None:
        master_df = df_assay.copy()
    else:
        master_df = master_df.merge(
            df_assay, on=["SAMPLE_ID", "SAMPLE_NAME"], how="outer", suffixes=("", "_new")
        )
        # prefer existing SMILES, fill missing with new
        if "SMILES_new" in master_df.columns:
            master_df["SMILES"] = master_df["SMILES"].combine_first(master_df["SMILES_new"])
            master_df.drop(columns=["SMILES_new"], inplace=True)

if master_df is None or master_df.empty:
    raise RuntimeError("No assay data could be built from the provided zip files.")

# (optional) quick peek
master_df.head()

Reading Tox21 zip files:   7%|▋         | 6/82 [00:46<07:46,  6.14s/it]

[skip] No activity rows in tox21-err-p1.zip


Reading Tox21 zip files:  15%|█▍        | 12/82 [01:29<06:44,  5.77s/it]

[skip] No activity rows in tox21-pgc-err-p1.zip


Reading Tox21 zip files:  20%|█▉        | 16/82 [01:55<05:55,  5.39s/it]

[skip] No activity rows in tox21-dt40-p1.zip


Reading Tox21 zip files:  40%|████      | 33/82 [04:17<05:11,  6.37s/it]

[skip] No activity rows in tox21-rt-viability-hek293-p1.zip
[skip] No activity rows in tox21-trhr-hek293-p1.zip


Reading Tox21 zip files:  50%|█████     | 41/82 [05:21<05:49,  8.53s/it]

[skip] No activity rows in tox21-ms-ache-p2.zip
[skip] No activity rows in tox21-ache-p4.zip


Reading Tox21 zip files:  57%|█████▋    | 47/82 [05:58<04:22,  7.49s/it]

[skip] No activity rows in tox21-ache-p5.zip
[skip] No activity rows in tox21-ror-cho-viability-p1.zip


Reading Tox21 zip files:  67%|██████▋   | 55/82 [06:52<03:38,  8.07s/it]

[skip] No activity rows in tox21-spec-hepg2-p1.zip


Reading Tox21 zip files:  78%|███████▊  | 64/82 [08:03<02:32,  8.48s/it]

[skip] No activity rows in tox21-spec-hek293-p1.zip


Reading Tox21 zip files:  82%|████████▏ | 67/82 [08:12<01:15,  5.06s/it]

[skip] No activity rows in tox21-rt-viability-hepg2-p1.zip


Reading Tox21 zip files:  94%|█████████▍| 77/82 [09:42<00:44,  8.90s/it]

[skip] No activity rows in tox21-ache-p3.zip


Reading Tox21 zip files:  98%|█████████▊| 80/82 [10:01<00:14,  7.47s/it]

[skip] No activity rows in tox21-rar-viability-p2.zip


Reading Tox21 zip files: 100%|██████████| 82/82 [10:10<00:00,  7.44s/it]


Unnamed: 0,SAMPLE_ID,SAMPLE_NAME,er-bla-agonist-p2,SMILES,p53-bla-p7,car-agonist-p1,ap1-agonist-p1,er-luc-bg1-4e2-antagonist-p1,sbe-bla-antagonist-p1,p450-2c19-p1,...,vdr-bla-agonist-p1,ppard-bla-agonist-p1,vdr-bla-antagonist-p1,p53-bla-p1,er-luc-bg1-4e2-agonist-p2,hre-bla-agonist-p1,rar-antagonist-p2,ar-bla-agonist-p1,hdac-p1,tshr-wt-p1
0,NCGC00013012-01,Calcium 5-ketogluconate,Inactive,[Ca+2].O=C([O-])[C@H](O)[C@@H](O)[C@H](O)C(=O)...,,,,Inactive,,,...,,,,Inactive,Inactive,,,Inactive,,
1,NCGC00013015-01,"D-Gluconic acid, barium salt",Inactive,[Ba+2].O[C@H]([C@@H](O)C([O-])=O)[C@H](O)[C@H]...,,,,Inactive,,,...,,,,Inactive,Inactive,,,Inactive,,
2,NCGC00013034-01,Estradiol cypionate,Agonist,Oc4cc5CC[C@@H]1[C@H](CC[C@]2(C)[C@H](CC[C@@H]1...,,,,Inactive,,,...,,,,Inactive,Inconclusive,,,Agonist,,
3,NCGC00013037-01,Thanite,Inactive,N#CSCC(=O)O[C@@H]1C[C@H]2CC[C@]1(C)C2(C)C,,,,Inactive,,,...,,,,Inactive,Inactive,,,Inactive,,
4,NCGC00013042-01,8-Quinolinol Salicylic acid (1:1),Agonist,O=C(O)c1ccccc1O.Oc1cccc2cccnc12,,,,Inactive,,,...,,,,Inconclusive,Agonist,,,Inactive,,


In [5]:
from utils.cheminfo_utils import standardize_smiles
tqdm.pandas()

# standardize SMILES and drop rows where standardization failed
master_df = master_df.rename(columns={"SMILES": "TOX21 SMILES"}).copy()
master_df["std_SMILES"] = master_df["TOX21 SMILES"].progress_apply(standardize_smiles)
master_df = master_df[master_df["std_SMILES"].notnull() & (master_df["std_SMILES"] != "")]
master_df.reset_index(drop=True, inplace=True)

# non-assay columns to keep
non_assay_cols = ["SAMPLE_NAME", "TOX21 SMILES", "std_SMILES"]
if "SAMPLE_ID" in master_df.columns:
    non_assay_cols.insert(1, "SAMPLE_ID")

100%|██████████| 13128/13128 [02:11<00:00, 100.12it/s]


In [6]:
def process_assay_value(val, col_name):
    """
    Normalize any 'Active'/'Inactive' readout from agonist/antagonist assays.
    Input values are expected to be one of: 'Inactive', 'Agonist', 'Antagonist', 'Inconclusive', NaN.
    Output: 'Active', 'Inactive', or '' (empty for inconclusive/NA).
    """
    if pd.isna(val):
        return ""
    vs = str(val).strip()
    name = col_name.lower()

    # antagonists → Active means 'antagonist' match, inactive means 'inactive'
    if "antagonist" in name:
        if "Inactive" in vs:
            return "Inactive"
        if "Antagonist" in vs:
            return "Active"
        return ""

    # agonists → Active means 'agonist' match, inactive means 'inactive'
    if "agonist" in name:
        if "Inactive" in vs:
            return "Inactive"
        if "Agonist" in vs:
            return "Active"
        return ""

    # fallback (rare)
    if "Inactive" in vs:
        return "Inactive"
    if "Active" in vs:
        return "Active"
    return ""

# find which master columns correspond to the allowed assays
master_cols_lower = {c.lower(): c for c in master_df.columns}
assay_cols_present = [master_cols_lower[a] for a in allowed_assays_lower if a in master_cols_lower]

# process each assay column into normalized 'Active'/'Inactive' strings
processed = {}
new_assay_cols = []

for col in assay_cols_present:
    lower = col.lower()
    prefix = col.split("-")[0].upper()  # e.g., 'PPARG'
    if ("agonist" in lower) or ("antagonist" in lower):
        kind = "antagonist" if "antagonist" in lower else "agonist"
        new = f"{prefix} {kind} / {col}"
        new_assay_cols.append(new)
        processed[new] = master_df[col].apply(lambda v, c=col: process_assay_value(v, c))
    else:
        # split into two synthesized columns if a generic assay name ever appears
        ago = f"{prefix} agonist / {col}"
        ant = f"{prefix} antagonist / {col}"
        new_assay_cols.extend([ago, ant])
        processed[ago] = master_df[col].apply(lambda v, c=ago: process_assay_value(v, c))
        processed[ant] = master_df[col].apply(lambda v, c=ant: process_assay_value(v, c))

df_assay = pd.DataFrame(processed, index=master_df.index)
df_wide = pd.concat([master_df[non_assay_cols], df_assay], axis=1)

In [7]:
def unanimous_activity(series: pd.Series) -> str:
    """
    If all non-empty values are the same ('Active' or 'Inactive') → that label; else 'Inconclusive'.
    """
    nonempty = series[series != ""]
    if len(nonempty) == 0:
        return "Inconclusive"
    if (nonempty == "Active").all():
        return "Active"
    if (nonempty == "Inactive").all():
        return "Inactive"
    return "Inconclusive"

def aggregate_by_std_smiles(grp: pd.DataFrame) -> pd.Series:
    out = {}
    # carry through non-assay columns (join differing values with ' ;; ' conservatively)
    for c in non_assay_cols:
        vals = grp[c].dropna().astype(str).tolist()
        out[c] = vals[0] if len(set(vals)) == 1 else " ;; ".join(sorted(set(vals))) if vals else ""
    # unanimity per assay column
    for c in new_assay_cols:
        out[c] = unanimous_activity(grp[c])
    return pd.Series(out)

final_df = df_wide.groupby("std_SMILES", as_index=False).apply(aggregate_by_std_smiles).reset_index(drop=True)

# optional: put columns in a clean order
final_cols = non_assay_cols + new_assay_cols
final_df = final_df[final_cols]
final_df.head()

Unnamed: 0,SAMPLE_NAME,SAMPLE_ID,TOX21 SMILES,std_SMILES,AR agonist / ar-bla-agonist-p1,AR antagonist / ar-bla-antagonist-p1,CAR agonist / car-agonist-p1,CAR antagonist / car-antagonist-p1,ER agonist / er-bla-agonist-p2,ER antagonist / er-bla-antagonist-p1,...,PR agonist / pr-bla-agonist-p1,PR antagonist / pr-bla-antagonist-p1,PXR agonist / pxr-p1,PXR antagonist / pxr-p1,RAR agonist / rar-agonist-p1,RAR antagonist / rar-antagonist-p2,ROR antagonist / ror-cho-antagonist-p1,RXR agonist / rxr-bla-agonist-p1,VDR agonist / vdr-bla-agonist-p1,VDR antagonist / vdr-bla-antagonist-p1
0,Bromoform,NCGC00257743-01,BrC(Br)Br,BrC(Br)Br,Inactive,Inactive,Inactive,Inconclusive,Inactive,Inactive,...,Inconclusive,Inactive,Inactive,Inactive,Inactive,Inconclusive,Inactive,Inactive,Inactive,Inactive
1,Pentabromoethane,NCGC00258125-01,BrC(Br)C(Br)(Br)Br,BrC(Br)C(Br)(Br)Br,Inactive,Inconclusive,Inactive,Inconclusive,Inactive,Inconclusive,...,Inactive,Inactive,Inactive,Inactive,Inactive,Active,Inconclusive,Inconclusive,Inactive,Inconclusive
2,"1,1,2,2-Tetrabromoethane",NCGC00257918-01,BrC(Br)C(Br)Br,BrC(Br)C(Br)Br,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,...,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive
3,"(E)-1,4-Dibromo-2-butene",NCGC00260276-01,BrC\C=C\CBr,BrC/C=C/CBr,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,...,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive
4,"1,2,5,6,9,10-Hexabromocyclododecane",NCGC00257050-01 ;; NCGC00258953-01,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,BrC1CCC(Br)C(Br)CCC(Br)C(Br)CCC1Br,Inactive,Inconclusive,Inactive,Active,Inactive,Inactive,...,Inactive,Active,Active,Inconclusive,Inactive,Active,Inconclusive,Inconclusive,Inactive,Inactive


In [8]:
final_df.to_excel(output_excel, index=False)
print(f"✅ Wrote standardized results to: {output_excel}")

✅ Wrote standardized results to: data/StandardizedTox21Results.xlsx
