In [1]:
import pandas as pd
import os
from chembl_webresource_client.new_client import new_client as client
import pubchempy as pcp
import warnings
import traceback

In [2]:
# Install a blanket "ignore" filter: no message, category or module restriction is
# given, so every warning raised after this cell runs is suppressed.
# NOTE(review): this also hides deprecation notices from the libraries imported
# above -- consider narrowing the filter to a specific category or module.
warnings.filterwarnings("ignore")

In [3]:
def abs_file_path(rel_path):
    """Resolve ``rel_path`` against the current working directory.

    Backslashes in ``rel_path`` are rewritten to forward slashes before the
    path is joined onto ``os.getcwd()``. The result is not normalised any
    further (no ``..`` collapsing, no symlink resolution).

    Parameters
    ----------
    rel_path : str
        Path relative to the working directory; either slash style is accepted.

    Returns
    -------
    str
        The working directory joined with the slash-converted ``rel_path``.
    """
    return os.path.join(os.getcwd(), rel_path.replace("\\", "/"))

In [4]:
def smiles_from_mol_id(list_mol_id):
    """Look up a SMILES string for every molecule identifier in ``list_mol_id``.

    The identifiers are treated as ChEMBL IDs when any of them contains the
    substring ``"CHEMBL"``; otherwise they are treated as PubChem CIDs. The
    decision is made on the individual identifiers, not on the printed form of
    the container, so a pandas Series whose name or index happens to contain
    "CHEMBL" is not misrouted and a one-shot iterator is handled correctly.

    Parameters
    ----------
    list_mol_id : iterable or str
        Molecule identifiers, expected to all come from the same database.
        A bare string is treated as a single identifier.

    Returns
    -------
    list
        One entry per identifier, in input order: the ``canonical_smiles`` of
        the first ChEMBL match, or the ``isomeric_smiles`` of the PubChem
        compound.

    Notes
    -----
    Each identifier is looked up individually through the ChEMBL / PubChem
    client. Lookup failures (no match, record without structure data, client
    errors) are not handled here and propagate to the caller.
    """
    # A plain string would otherwise be iterated character by character.
    if isinstance(list_mol_id, str):
        list_mol_id = [list_mol_id]
    # Materialise once: the input is scanned for the database check and then
    # iterated again for the lookups, which a generator could not survive.
    mol_ids = list(list_mol_id)

    if any("CHEMBL" in str(mol_id) for mol_id in mol_ids):
        molecule = client.molecule  # loop-invariant resource handle
        return [
            molecule.filter(chembl_id=chembl_id)[0]["molecule_structures"]["canonical_smiles"]
            for chembl_id in mol_ids
        ]

    return [pcp.Compound.from_cid(cid).isomeric_smiles for cid in mol_ids]

In [5]:
def halflife_formatting(source_df, isozyme):
    """Pick the half-life column of ``source_df`` that belongs to ``isozyme``.

    Parameters
    ----------
    source_df : pandas.DataFrame
        Source table; it must contain the column used for the given isozyme.
    isozyme : str
        ``"3A4"`` reads ``"Standard Value"``, ``"RLM"`` reads
        ``"Half-life (minutes)"``, ``"HLC"`` reads ``"Half-life"``.

    Returns
    -------
    pandas.Series or list
        The selected column, or an empty list when ``isozyme`` is none of the
        three recognised labels. No numeric conversion is performed here.
    """
    if isozyme == "3A4":
        return source_df["Standard Value"]

    if isozyme == "RLM":
        # Cells that are exactly the string ">30" become "30"; the replacement
        # runs on a copy of the whole frame, the caller's frame is untouched.
        capped_df = source_df.replace({">30": "30"})
        return capped_df["Half-life (minutes)"]

    if isozyme == "HLC":
        return source_df["Half-life"]

    # Unrecognised isozyme label: nothing to extract.
    return []

In [6]:
def chemprop_data_formatting(source_csv_file, isozyme, sep=","):
    """Create ``project_resources/<isozyme>.csv`` for use with the chemprop library.

    The output has two columns, ``smiles`` and ``half-life``, one row per
    molecule of the source table. If the output file already exists, nothing
    is read or written and a message is printed instead.

    Parameters
    ----------
    source_csv_file : str
        Path of the source CSV, relative to the working directory.
    isozyme : str
        Dataset label; selects the half-life column (see
        ``halflife_formatting``) and names the output file.
    sep : str, default ","
        Field separator of the source CSV.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If the source table has neither a ``"Molecule ChEMBL ID"`` nor a
        ``"PUBCHEM_CID"`` column, or lacks a column required for ``isozyme``.
    """
    output_path = abs_file_path(f"project_resources/{isozyme}.csv")

    # Test the target path directly (instead of listing the directory) so a
    # missing ``project_resources`` folder is not an error, and do it before
    # reading the source so an already-built dataset costs no I/O.
    if os.path.exists(output_path):
        print(f"{isozyme}.csv already exists in dir")
        return

    source_df = pd.read_csv(abs_file_path(source_csv_file), sep=sep)

    if isozyme == "3A4":
        # additional formatting, since not all molecules have the desired property
        source_df = source_df[source_df["Standard Type"] == "T1/2"]

    if "Molecule ChEMBL ID" in source_df.columns:
        mol_ids = source_df["Molecule ChEMBL ID"]
    else:
        mol_ids = source_df["PUBCHEM_CID"]

    # Extract the half-lives first: it is a local operation, so a missing
    # column is reported before any per-molecule SMILES lookup is started.
    half_lives = list(halflife_formatting(source_df, isozyme))
    final_df = pd.DataFrame({
        "smiles": smiles_from_mol_id(mol_ids),
        "half-life": half_lives,
    })

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    final_df.to_csv(output_path, index=False)
    print(f"{isozyme}.csv was successfully created")