In [None]:
import logging
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset

from src.model import ChemicalMetaRegressor

# Tunable knobs for this cell: RNG seed and how many training rows to keep.
SEED = 42
N_TRAIN_ROWS = 100

# Seed NumPy's global RNG before sampling; the sample() call below is given no
# random_state of its own.
np.random.seed(SEED)

# Fetch the challenge train split and the blinded test split.
ds = load_dataset("openadmet/openadmet-expansionrx-challenge-train-data")
ds_test = load_dataset("openadmet/openadmet-expansionrx-challenge-test-data-blinded")

# Only a random subset of the training rows is kept; the test split is used whole.
train_df = ds["train"].to_pandas().sample(n=N_TRAIN_ROWS)
test_df = ds_test["test"].to_pandas()

In [None]:
# Per-assay conversion table, one row per assay column of the training data:
#   Assay      - column name as it appears in the raw data
#   Log_Scale  - whether the values get log10-transformed
#   Multiplier - factor applied to the (shifted) value before taking the log
#   Log_name   - name of the transformed column
data = """Assay,Log_Scale,Multiplier,Log_name
LogD,False,1,LogD
KSOL,True,1e-6,LogS
HLM CLint,True,1,Log_HLM_CLint
MLM CLint,True,1,Log_MLM_CLint
Caco-2 Permeability Papp A>B,True,1e-6,Log_Caco_Papp_AB
Caco-2 Permeability Efflux,True,1,Log_Caco_ER
MPPB,True,1,Log_Mouse_PPB
MBPB,True,1,Log_Mouse_BPB
MGMB,True,1,Log_Mouse_MPB
"""
s = StringIO(data)
conversion_df = pd.read_csv(s)

# Lookup keyed by the raw assay name; each value is the remainder of that
# table row, i.e. (Log_Scale, Multiplier, Log_name).
conversion_dict = {row[0]: row[1:] for row in conversion_df.to_numpy()}
# Start from the two identifier columns and append one transformed column per
# assay. Every column of train_df from position 2 onward is treated as an assay
# and must have an entry in conversion_dict.
log_train_df = train_df[["SMILES", "Molecule Name"]].copy()
for assay in train_df.columns[2:]:
    log_scale, multiplier, short_name = conversion_dict[assay]
    values = train_df[assay].astype(float)
    # Log-scaled assays become log10((value + 1) * multiplier); all other
    # assays are copied through as floats.
    log_train_df[short_name] = (
        np.log10((values + 1) * multiplier) if log_scale else values
    )

# One histogram per transformed target, laid out on a 3x3 grid.
figure, axes = plt.subplots(nrows=3, ncols=3, figsize=(10, 7))
axes = axes.flatten()

# Everything after the two identifier columns is a target column.
log_col_names = log_train_df.columns[2:]
for idx, name in enumerate(log_col_names):
    ax = sns.histplot(log_train_df[name], ax=axes[idx])
    # The title reports how many rows have a non-missing value for this target.
    ax.set_title(f"N = {log_train_df[name].count()}")
plt.tight_layout()

In [None]:
# Show INFO-level log records with a timestamp and the emitting module.
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(levelname)s [%(module)s]: %(message)s",
)

# Train on every transformed target column.
# `log_col_names` was captured right after the log transform, so it lists
# exactly the assay columns. The previous `log_train_df.columns[2:-1]` slice
# dropped the last assay on a fresh kernel and was only correct when a later
# cell had already appended its "dataset" column to `log_train_df`.
reg = ChemicalMetaRegressor(
    smiles_col="SMILES",
    target_cols=log_col_names,
    training_data=log_train_df,
)
reg.train_models(n_keep_classical=3)

In [None]:
# Rebuild the prediction input from the dataset on every run. `test_df` is
# overwritten with the predictions below, so reading the SMILES from `test_df`
# itself made this cell depend on how many times it had already been executed.
blind_df = ds_test["test"].to_pandas()
test_df = reg.predict(blind_df["SMILES"])

# Attach molecule names by position (to_numpy() bypasses index alignment).
# NOTE(review): assumes reg.predict returns one row per input SMILES, in input
# order - confirm against ChemicalMetaRegressor.predict.
test_df["Molecule Name"] = blind_df["Molecule Name"].to_numpy()
test_df

In [None]:
import warnings

# Tag each frame with its origin so the two can be told apart after stacking.
log_train_df["dataset"] = "train"
test_df["dataset"] = "test"

# Stack measured training values on top of the test-set predictions, keeping
# only the identifier, the origin tag and the target columns.
combo_cols = ["Molecule Name", "dataset"] + list(log_col_names)
combo_df = pd.concat([log_train_df[combo_cols], test_df[combo_cols]], ignore_index=True)

# Reshape to "tall" format: one row per (molecule, dataset, assay) value.
melt_df = combo_df.melt(
    id_vars=["Molecule Name", "dataset"],
    var_name="Assay",
    value_name="Log_Value",
)

# Box plot of every assay, train vs. test side by side.
ax = sns.boxplot(x="Assay", y="Log_Value", hue="dataset", data=melt_df)

# Break the tick labels at underscores so each label is stacked over several
# lines; UserWarnings raised while replacing the labels are silenced.
labels = [x.get_text() for x in ax.get_xticklabels()]
with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)
    ax.set_xticklabels(["\n".join(x.split("_")) for x in labels])

In [None]:
# Inverse lookup: transformed column name -> (Assay, Log_Scale, Multiplier).
reverse_dict = {row[-1]: row[:-1] for row in conversion_df.values}

# Make sure SMILES is a regular column before selecting it below.
# NOTE(review): presumably reg.predict returns a frame indexed by SMILES - confirm.
# The guard keeps this cell re-runnable: an unconditional reset_index() inserts
# a stray "index" column on the second run and raises once that column exists.
if "SMILES" not in test_df.columns:
    test_df = test_df.reset_index()

output_df = test_df[["SMILES", "Molecule Name"]].copy()
for col in test_df.columns:
    # Skip anything that is not a transformed target column.
    if col not in reverse_dict:
        continue
    orig_name, log_scale, multiplier = reverse_dict[col]
    output_df[orig_name] = test_df[col]
    if log_scale:
        # Undo y = log10((x + 1) * multiplier)  ->  x = 10**y / multiplier - 1
        output_df[orig_name] = 10 ** output_df[orig_name] / multiplier - 1

output_df.to_csv("submission.csv", index=False)
output_df