In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import inchi
from tqdm import tqdm
from time import sleep
from tqdm.notebook import tqdm

from chembl_structure_pipeline import standardizer as ChEMBL_standardizer
from papyrus_structure_pipeline import standardize

In [None]:
# Read the raw efflux data (S: substrate, NS: non-substrate).
# The ChEMBL query export is loaded as a DataFrame.
efflux_chembl = pd.read_csv('raw_data/efflux/efflux_ChEMBL_query.csv', delimiter=',')

# Metrabase substrate sets, one SD file per transporter. The suppliers are
# only created here; they are iterated later, when the molecules are merged
# into the ChEMBL table.
BCRP1_S = Chem.SDMolSupplier('raw_data/efflux/BCRP1_efflux_metrabase_S.sdf')
MDR1_S = Chem.SDMolSupplier('raw_data/efflux/MDR1_efflux_metrabase_S.sdf')
MRP1_S = Chem.SDMolSupplier('raw_data/efflux/MRP1_efflux_metrabase_S.sdf')
MRP2_S = Chem.SDMolSupplier('raw_data/efflux/MRP2_efflux_metrabase_S.sdf')
# Path normalised: the original literal contained a doubled separator ('raw_data//efflux').
MRP3_S = Chem.SDMolSupplier('raw_data/efflux/MRP3_efflux_metrabase_S.sdf')
MRP4_S = Chem.SDMolSupplier('raw_data/efflux/MRP4_efflux_metrabase_S.sdf')

# Metrabase non-substrate sets, one SD file per transporter.
# NOTE(review): the BCRP1 file uses the suffix "_NS" while the other five use
# "_nonS" — confirm this matches the file name on disk.
BCRP1_NS = Chem.SDMolSupplier('raw_data/efflux/BCRP1_efflux_metrabase_NS.sdf')
MDR1_NS = Chem.SDMolSupplier('raw_data/efflux/MDR1_efflux_metrabase_nonS.sdf')
MRP1_NS = Chem.SDMolSupplier('raw_data/efflux/MRP1_efflux_metrabase_nonS.sdf')
MRP2_NS = Chem.SDMolSupplier('raw_data/efflux/MRP2_efflux_metrabase_nonS.sdf')
# Path normalised: the original literal contained a doubled separator ('efflux//MRP3').
MRP3_NS = Chem.SDMolSupplier('raw_data/efflux/MRP3_efflux_metrabase_nonS.sdf')
MRP4_NS = Chem.SDMolSupplier('raw_data/efflux/MRP4_efflux_metrabase_nonS.sdf')

In [None]:
# Per-transporter inputs. All three lists share one fixed order, so position k
# of each list refers to the same transporter.
efflux_targets_S_data = [
    BCRP1_S,
    MDR1_S,
    MRP1_S,
    MRP2_S,
    MRP3_S,
    MRP4_S,
]
efflux_targets_NS_data = [
    BCRP1_NS,
    MDR1_NS,
    MRP1_NS,
    MRP2_NS,
    MRP3_NS,
    MRP4_NS,
]
# UniProt accessions, paired by position with the two lists above.
efflux_targets_UniProt = [
    'Q9UNQ0',  # paired with BCRP1_*
    'P08183',  # paired with MDR1_*
    'P33527',  # paired with MRP1_*
    'Q92887',  # paired with MRP2_*
    'O15438',  # paired with MRP3_*
    'O15439',  # paired with MRP4_*
]

In [None]:
# Label every ChEMBL record from its pchembl value:
# strictly greater than 5 -> 'Substrate', anything else -> 'Non-substrate'.
# NOTE(review): the earlier heading of this cell also mentioned a standard_value
# cut-off (> 10 000 nM), but only 'pchembl_value' is consulted — confirm whether
# that second rule is still wanted.
# NOTE(review): a missing pchembl_value does not compare as greater than 5, so
# such rows (if any) are labelled 'Non-substrate' — confirm this is intended.
is_substrate = efflux_chembl['pchembl_value'] > 5
efflux_chembl['status_efflux'] = np.where(is_substrate, 'Substrate', 'Non-substrate')

efflux_chembl.head(10)

In [None]:
def _metrabase_records(suppliers, accessions, status):
    """Collect one row dict per readable molecule from the given molecule suppliers.

    Parameters
    ----------
    suppliers : iterable
        Molecule suppliers, one per transporter, iterated in order.
    accessions : iterable of str
        UniProt accession for each supplier, paired by position.
    status : str
        Value written to 'status_efflux' for every collected molecule.

    Returns
    -------
    list of dict
        One dict per molecule for which SMILES and InChIKey could be generated.
        Entries yielded as None by a supplier are skipped.
    """
    records = []
    for supplier, accession in zip(suppliers, accessions):
        try:
            for mol in supplier:
                if mol is None:
                    continue
                # Conversion errors are handled per molecule so that one bad
                # record does not discard the remainder of the file.
                try:
                    smiles = Chem.MolToSmiles(mol)
                    inchi_key = inchi.InchiToInchiKey(inchi.MolToInchi(mol))
                except Exception as e:
                    print(f"An error occurred with smiles and inchikey creation ({accession}, {status}): {e}")
                    continue
                # NOTE(review): keys that do not match a column name of the
                # ChEMBL table are dropped when the rows are merged below —
                # confirm the spelling/case of 'Molregno' against the CSV header.
                records.append({'Molregno': 'Unknown', 'pchembl_value': 'Unknown',
                                'canonical_smiles': smiles, 'standard_inchi_key': inchi_key,
                                'accession': accession, 'status_efflux': status})
        except Exception as e:
            # Failure while reading from the supplier itself: keep what was
            # collected so far and move on to the next transporter.
            print(f"An error occurred while reading molecules ({accession}, {status}): {e}")
    return records


# Work on a copy so that efflux_chembl is left untouched and re-running this
# cell does not append the Metrabase rows a second time.
efflux_df = efflux_chembl.copy()

# Substrates first, then non-substrates (same order as the original loops).
metrabase_records = (
    _metrabase_records(efflux_targets_S_data, efflux_targets_UniProt, 'Substrate')
    + _metrabase_records(efflux_targets_NS_data, efflux_targets_UniProt, 'Non-substrate')
)

if metrabase_records:
    metrabase_df = pd.DataFrame(metrabase_records)
    # Only columns that already exist in the ChEMBL table are kept; columns the
    # new rows do not provide are filled with NaN by the concatenation.
    shared_columns = [col for col in metrabase_df.columns if col in efflux_df.columns]
    # A single concatenation replaces the row-by-row .loc appends.
    efflux_df = pd.concat([efflux_df, metrabase_df[shared_columns]], ignore_index=True)

In [None]:
# Write the merged efflux table to disk; index=True stores the row index as the first CSV column.
efflux_df.to_csv("raw_data/efflux/efflux_train_merged_query_results.csv", index=True)

In [None]:
# Display the first rows of the merged table as the cell output.
efflux_df.head()

In [None]:
# Keep only the two columns used downstream: the raw SMILES
# (renamed from 'canonical_smiles') and the efflux class label.
df = (
    efflux_df
    .rename(columns={'canonical_smiles': 'SMILES_raw'})
    .loc[:, ['SMILES_raw', 'status_efflux']]
)


In [None]:
# Remove rows without SMILES

def remove_nan_smiles(df):
    """Return a new frame without the rows whose 'SMILES_raw' is missing.

    The input frame is not modified; the result is re-indexed from 0.
    """
    return df.dropna(subset=['SMILES_raw']).reset_index(drop=True)

In [None]:
# Papyrus Standardization

def create_sd_smiles(sd_mol):
    """Return the SMILES string of a standardized molecule.

    Any exception raised during the conversion is reported on stdout and
    None is returned instead.
    """
    try:
        return Chem.MolToSmiles(sd_mol)
    except Exception as e:
        print(f"An sd_smiles error occurred: {str(e)}")
    return None
    
# Create InChI keys from standardized molecules
def mol_to_inchi_key(sd_mol):
    """Return the InChIKey for ``sd_mol``, or None when ``sd_mol`` is None."""
    if sd_mol is None:
        return None

    # The key is derived from the InChI string of the molecule.
    return inchi.InchiToInchiKey(inchi.MolToInchi(sd_mol))

def standardize_molecule(mol):
    """Run the Papyrus ``standardize`` routine on ``mol`` with ``raise_error=False``.

    The value returned by ``standardize`` is passed through unchanged.
    NOTE(review): presumably None is returned when standardization fails —
    confirm against papyrus_structure_pipeline.
    """
    return standardize(mol, raise_error=False)

# Standardize

def standardize_workflow(df_raw):
    """Standardize every molecule in ``df_raw`` and attach the results as columns.

    For each value of 'SMILES_raw' the molecule is parsed, standardized, and
    converted to a SMILES string and an InChIKey. The results are stored in
    the columns 'papyrus_SMILES' and 'papyrus_inchi_key'; rows for which a
    step failed hold None there.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Frame with a 'SMILES_raw' column. Any index is accepted. The frame is
        modified in place (two columns are added or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``df_raw`` object.
    """
    sd_smiles_values = []
    sd_inchi_keys = []

    # Iterate over the values rather than positional labels, so the frame
    # does not need a 0..n-1 index.
    for smiles in df_raw['SMILES_raw']:
        mol = Chem.MolFromSmiles(smiles)
        # An unparsable SMILES yields None; it is not handed to the
        # standardizer and simply produces empty results for this row.
        sd_mol = standardize_molecule(mol) if mol is not None else None
        sd_smiles_values.append(create_sd_smiles(sd_mol))
        sd_inchi_keys.append(mol_to_inchi_key(sd_mol))

    # Assign each column in one step with an explicit object dtype: the
    # columns exist even for an empty frame, and no per-cell dtype upcasting
    # is needed when some rows are None.
    df_raw['papyrus_SMILES'] = pd.Series(sd_smiles_values, index=df_raw.index, dtype=object)
    df_raw['papyrus_inchi_key'] = pd.Series(sd_inchi_keys, index=df_raw.index, dtype=object)

    print(f'df length after standardization: {len(df_raw)}')

    return df_raw

In [None]:
# Check for missing inchi key

def missing_inchi(df_raw):
    """Report missing standardization results and drop rows without an InChIKey.

    Prints the frame length together with the number of missing values in
    'papyrus_SMILES' and 'papyrus_inchi_key', then the length after filtering.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Frame with 'papyrus_SMILES' and 'papyrus_inchi_key' columns. It is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Independent copy holding only the rows whose 'papyrus_inchi_key' is
        present. The original index labels are kept.
    """
    smiles_nan = df_raw['papyrus_SMILES'].isna().sum()
    inchikey_nan = df_raw['papyrus_inchi_key'].isna().sum()
    print(f'DB length: {len(df_raw)},        SMILES nan: {smiles_nan},        inchi key nan: {inchikey_nan}')

    # Remove rows with missing inchikey. The explicit copy makes the result
    # independent of df_raw, so adding columns to it later does not raise
    # SettingWithCopyWarning.
    df_valid_inchi = df_raw[df_raw['papyrus_inchi_key'].notna()].copy()
    print('-----remove missing inchikey----')
    print(f'updated length: {len(df_valid_inchi)}')

    return df_valid_inchi

In [None]:
# Create connectivity inchi

def inchi_first_part(inchi):
    """Return the text before the first '-' in ``inchi``.

    If the string contains no '-', it is returned unchanged.
    Note: inside this function the parameter name hides the imported
    ``inchi`` module, which is not used here.
    """
    head, _, _ = inchi.partition('-')
    return head

def create_connectivity_inchi(df):
    """Add an 'inchi_connectivity' column derived from 'papyrus_inchi_key'.

    Each key is reduced to the part before its first '-' via
    ``inchi_first_part``. The column is written into ``df`` itself, and the
    same frame is returned.
    """
    connectivity = df['papyrus_inchi_key'].map(inchi_first_part)
    df['inchi_connectivity'] = connectivity

    return df

In [None]:
# Check for duplicates

def remove_duplicates(df):
    """Resolve duplicated 'inchi_connectivity' keys.

    Keys that occur with more than one distinct 'status_efflux' value are
    removed entirely. For the remaining keys only the first row is kept.
    Counts are printed along the way. The input frame is not modified; the
    result is re-indexed from 0.
    """
    key_col = 'inchi_connectivity'

    print(f'length: {len(df)}')
    print(f'unique_inchi: {df[key_col].nunique()}')

    # Number of distinct class labels observed for each key.
    labels_per_key = df.groupby(key_col)['status_efflux'].nunique()
    conflicting_keys = labels_per_key.loc[labels_per_key.gt(1)].index

    print(f'Contradicting duplicates: {len(conflicting_keys)}')

    # Drop every row whose key carries contradictory labels.
    consistent = df.loc[~df[key_col].isin(conflicting_keys)]
    print(len(consistent))
    print(consistent[key_col].nunique())

    # Collapse the remaining (label-consistent) duplicates to their first row.
    deduplicated = consistent.drop_duplicates(subset=[key_col], keep="first")

    return deduplicated.reset_index(drop=True)

In [None]:
# Code classes

def class_code(df):
    """Encode 'status_efflux' as numbers: 'Substrate' -> 1, 'Non-substrate' -> 0.

    The column is overwritten in ``df`` itself, and the same frame is returned.
    """
    label_codes = {'Substrate': 1, 'Non-substrate': 0}
    encoded = df['status_efflux'].replace(label_codes)
    df['status_efflux'] = encoded

    return df


In [None]:
# Run the cleaning pipeline in order: drop rows without SMILES, standardize the
# molecules, drop rows without an InChIKey, derive the connectivity key,
# resolve duplicates, and encode the class labels as 1/0.
# Note: standardize_workflow and class_code return the frame they were given,
# so df_sd is the same object as df, and df_class_coded is the same object as
# df_no_duplicates.
df = remove_nan_smiles(df)
df_sd = standardize_workflow(df)
df_valid = missing_inchi(df_sd)
df_connectivity_inchi = create_connectivity_inchi(df_valid)
df_no_duplicates = remove_duplicates(df_connectivity_inchi)
df_class_coded = class_code(df_no_duplicates)

# Save the final training table; no index argument is passed to to_csv here.
df_class_coded.to_csv('train_data/kadar_efflux_train.csv')
