In [247]:
import pandas as pd
from rdkit.Chem import PandasTools
from rxitect.chem.utils import calc_fp

In [183]:
def lowercase_and_snake_case(string: str) -> str:
    string = string.lower()
    string = string.replace(" ", "_")
    return string

In [222]:
df = pd.read_table("../data/raw/ligand_activity.tsv", usecols=["Smiles", "Target ChEMBL ID", "pChEMBL Value", "Document Year", "Standard Type", "Standard Relation", "Comment"])

In [223]:
df.columns = [lowercase_and_snake_case(column_name) for column_name in df.columns]

In [224]:
df.document_year = df.document_year.astype("Int32")

In [225]:
df.dropna(subset=["smiles"], inplace=True)

In [226]:
df = df[~(df.pchembl_value < 4)]

In [230]:
doc_year = df.groupby("smiles")[["document_year"]].min().dropna()

In [191]:
df = df.set_index(["smiles", "target_chembl_id"])

In [192]:
positive_samples = df.pchembl_value.groupby(["smiles", "target_chembl_id"]).mean().dropna()

In [193]:
inactive = df[(df.comment.str.contains("Not Active") == True)]

In [194]:
inhibition = df[(df.standard_type == "Inhibition") & df.standard_relation.isin(["'<'", "'<='"])]

In [195]:
relation = df[(df.standard_type.isin(["EC50", "IC50", "Kd", "Ki"])) & df.standard_relation.isin(["'>'", "'>='"])]

In [196]:
negative_samples = pd.concat([inactive, inhibition, relation])

In [206]:
negative_samples = negative_samples[~negative_samples.index.isin(positive_samples.index)].copy()

In [207]:
negative_samples.pchembl_value = 3.99

In [209]:
negative_samples = negative_samples.pchembl_value.groupby(["smiles", "target_chembl_id"]).first()

In [214]:
qsar_data = pd.concat([positive_samples, negative_samples])

In [217]:
qsar_data = qsar_data.unstack("target_chembl_id")

In [219]:
qsar_data = qsar_data.sample(frac=1, random_state=123)

In [243]:
train_idx = qsar_data.index.intersection(doc_year[(doc_year.document_year > 2015) == False].index)
test_idx = qsar_data.index.intersection(doc_year[(doc_year.document_year > 2015) == True].index)

In [249]:
# qsar_data_train = qsar_data.loc[train_idx].reset_index("smiles")
# qsar_data_test = qsar_data.loc[test_idx].reset_index("smiles")

In [253]:
# PandasTools.AddMoleculeColumnToFrame(qsar_data_train, 'smiles', 'mol')
# PandasTools.AddMoleculeColumnToFrame(qsar_data_test, 'smiles', 'mol')

In [255]:
# qsar_data_train["fingerprint"] = qsar_data_train.mol.apply(calc_fp)
# qsar_data_test["fingerprint"] = qsar_data_test.mol.apply(calc_fp)

In [271]:
# X_train, y_train = qsar_data_train.fingerprint, qsar_data_train[["CHEMBL226", "CHEMBL240", "CHEMBL251"]]
# X_test, y_test = qsar_data_test.fingerprint, qsar_data_test[["CHEMBL226", "CHEMBL240", "CHEMBL251"]]

In [295]:
to_write = qsar_data.copy()
to_write["temporal_split_test"] = True
to_write["temporal_split_test"].loc[train_idx] = False
to_write.reset_index()
to_write.columns.names = [None]
to_write.to_csv("../data/processed/multitask_ligand_activity.tsv", sep="\t", index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_write["temporal_split_test"].loc[train_idx] = False
