In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import inchi
from tqdm import tqdm
from time import sleep
from tqdm.notebook import tqdm

from chembl_structure_pipeline import standardizer as ChEMBL_standardizer
from papyrus_structure_pipeline import standardize

[ForwardRef('ExtensionArray'), <class 'numpy.ndarray'>]
[ForwardRef('ExtensionArray'), <class 'numpy.ndarray'>, ForwardRef('Index'), ForwardRef('Series')]
[<class 'str'>, <class 'float'>, <class 'bool'>]
[ForwardRef('Period'), ForwardRef('Timestamp'), ForwardRef('Timedelta')]
[ForwardRef('Period'), ForwardRef('Timestamp'), ForwardRef('Timedelta'), ForwardRef('Interval')]
[<class 'str'>, <class 'float'>, <class 'bool'>, ForwardRef('Period'), ForwardRef('Timestamp'), ForwardRef('Timedelta'), ForwardRef('Interval'), <class 'numpy.datetime64'>, <class 'numpy.timedelta64'>, <class 'datetime.datetime'>]
[ForwardRef('Timestamp'), <class 'datetime.datetime'>, <class 'numpy.datetime64'>, <class 'numpy.int64'>, <class 'float'>, <class 'str'>]
[ForwardRef('Timedelta'), <class 'datetime.timedelta'>, <class 'numpy.timedelta64'>, <class 'numpy.int64'>, <class 'float'>, <class 'str'>]
[<class 'str'>, <class 'datetime.tzinfo'>]
[<class 'str'>, <class 'int'>]
[typing.Hashable, typing.Sequence[typing.Ha

[14:14:11] Initializing Normalizer


In [2]:
kpuu_raw =  pd.read_excel('Kpuu_raw.xlsx')

[typing.List[int], typing.List[str]]


In [3]:
kpuu_raw.head()

Unnamed: 0,SMILES,"Kp,uu,brain",species,Ref_DOI
0,CCN1N=C(C2=C1C(=O)N(C=C2)C1CCN(C)CC1)C1=CC(OC(...,0.38,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150
1,CCN1N=C(C2=C1C(=O)N(C=C2)C1CCN(CC1)C1COC1)C1=C...,0.15,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150
2,CCN1N=C(C2=C1C(=O)N(C=C2)C1CCOCC1)C1=CC(OC(F)(...,0.43,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150
3,CCN1N=C(C2=C1C(=O)N(C=C2)[C@@H]1CCOC[C@@H]1OC1...,0.22,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150
4,CCO[C@H]1COCC[C@H]1N1C=CC2=C(N(CC)N=C2C2=CC(OC...,0.3,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150


In [4]:
def compare_values(row):
    if row['Kp,uu,brain'] > 0.33:
        return 'active'
    elif row['Kp,uu,brain'] <= 0.1:
        return 'inactive'
    else:
        return '?'

kpuu_raw['status_activity'] = kpuu_raw.apply(compare_values, axis=1)

kpuu_raw.head(10)

Unnamed: 0,SMILES,"Kp,uu,brain",species,Ref_DOI,status_activity
0,CCN1N=C(C2=C1C(=O)N(C=C2)C1CCN(C)CC1)C1=CC(OC(...,0.38,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150,active
1,CCN1N=C(C2=C1C(=O)N(C=C2)C1CCN(CC1)C1COC1)C1=C...,0.15,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150,?
2,CCN1N=C(C2=C1C(=O)N(C=C2)C1CCOCC1)C1=CC(OC(F)(...,0.43,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150,active
3,CCN1N=C(C2=C1C(=O)N(C=C2)[C@@H]1CCOC[C@@H]1OC1...,0.22,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150,?
4,CCO[C@H]1COCC[C@H]1N1C=CC2=C(N(CC)N=C2C2=CC(OC...,0.3,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150,?
5,CCN1N=C(C2=C1C(=O)N(C=C2)[C@@H]1CCOC[C@@H]1OCC...,0.09,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150,inactive
6,CCC1=NC(=C2C=CC(=CN12)[C@H]1CN(C)C(=O)C1)C1=CC...,0.08,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150,inactive
7,CCC1=NC(=C2C=CC(=CN12)C1CCN(C)CC1)C1=CN=C(N)C(...,0.18,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150,?
8,CCC1=NC(=C2C=CC(=CN12)[C@@H]1COCCN1C)C1=CC(=C(...,0.24,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150,?
9,CN1CCCC[C@H]1c1ccc2c(-c3cnc(N)c(C(F)(F)F)c3)nc...,1.2,mouse,Lawrenz_2023:10.1021/acs.jcim.3c00150,active


In [5]:
kpuu_raw = kpuu_raw.rename(columns={'SMILES': 'SMILES_raw'})

columns_to_keep = ['SMILES_raw', 'status_activity', 'species']
kpuu = kpuu_raw[columns_to_keep]


In [6]:
#Remove rows without SMILES

def remove_nan_smiles(df):

    df = df[~(df['SMILES_raw'].isna())]
    df = df.reset_index(drop=True)

    return df

In [7]:
#Papyrus Standardization

def create_sd_smiles(sd_mol):
    try:
        standardized_smiles =  Chem.MolToSmiles(sd_mol)
        return standardized_smiles
    except Exception as e:
        print(f"An sd_smiles error occurred: {str(e)}")
        return None
    
#Create InChI keys from standardized molecules
def mol_to_inchi_key(sd_mol):
    if sd_mol is not None:

        inchi_str = inchi.MolToInchi(sd_mol)
        inchi_key = inchi.InchiToInchiKey(inchi_str)
    else:
        inchi_key = None   
    return inchi_key

def standardize_molecule(mol):
    standardized_mol =  standardize(mol,raise_error=False )
    return standardized_mol

#Standardize 

def standardize_workflow(df_raw):
    for i in range(0,len(df_raw)):
        smiles =df_raw.at[i,'SMILES_raw']
        mol = Chem.MolFromSmiles(smiles)
        sd_mol =  standardize_molecule(mol)
        sd_smiles = create_sd_smiles(sd_mol)
        sd_inchi_key = mol_to_inchi_key(sd_mol)
        df_raw.at[i,'papyrus_SMILES'] = sd_smiles
        df_raw.at[i,'papyrus_inchi_key'] = sd_inchi_key

    print(f'df length after standardization: {len(df_raw)}')

    return df_raw

In [8]:
#Check for missing inchi key

def missing_inchi(df_raw):
    smiles_nan = df_raw['papyrus_SMILES'].isna().sum()
    inchikey_nan =df_raw['papyrus_inchi_key'].isna().sum()
    print(f'DB length: {len(df_raw)},        SMILES nan: {smiles_nan},        inchi key nan: {inchikey_nan}')

    #Remove rows with missing inchikey
    df_valid_inchi= df_raw[((df_raw['papyrus_inchi_key'].notna()))]
    print('-----remove missing inchikey----')
    print(f'updated length: {len(df_valid_inchi)}')

    return df_valid_inchi

In [9]:
def inchi_first_part(inchi):
    return inchi.split('-')[0]

def create_connectivity_inchi(df):
    df['inchi_connectivity'] = df['papyrus_inchi_key'].apply(inchi_first_part)
    
    return df

In [10]:
#Check for duplicates

def remove_duplicates(df):
    print(f'length: {len(df)}')
    inchi_un = df['inchi_connectivity'].nunique()
    print(f'unique_inchi: {inchi_un}')


    unique_counts = df.groupby('inchi_connectivity')['status_activity'].nunique()
    contradicting_duplicates = unique_counts[unique_counts > 1].index

    print(f'Contradicting duplicates: {len(contradicting_duplicates)}')

    # Separate contradicting duplicates into a new dataframe
    duplicates_df = df[df['inchi_connectivity'].isin(contradicting_duplicates)].copy()

    # Remove contradicting duplicates from the original dataframe
    df = df[~df['inchi_connectivity'].isin(contradicting_duplicates)].copy()

    print(f'Original dataframe after contradicting duplicates removed: {len(df)}')
    print(f'Original dataframe unique_inchi after removal: {df["inchi_connectivity"].nunique()}')

    df=df.drop_duplicates(subset=['inchi_connectivity'], keep="first").reset_index(drop=True)
    
    print(f'Original dataframe after non-contradicting duplicates removed: {len(df)}')

    return df, duplicates_df

In [11]:
df = kpuu

df = remove_nan_smiles(df)

df_sd = standardize_workflow(df)
df_sd.to_csv('kpuu_standardized_val.csv')

df_valid = missing_inchi(df_sd)
df_valid.to_csv('kpuu_have_inchi_key_val.csv')

df_connectivity_inchi = create_connectivity_inchi(df_valid)
df_connectivity_inchi.to_csv('kpuu_connectivity_inchi_val.csv')



An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)










An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::






An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)








An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)





An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)













An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)











An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)









An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::






An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)








An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)







An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::
















An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::














An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)

















An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)
An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::












An sd_smiles error occurred: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(class RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(class RDKit::ROMol mol, struct RDKit::SmilesWriteParams params)









df length after standardization: 436
DB length: 436,        SMILES nan: 29,        inchi key nan: 29
-----remove missing inchikey----
updated length: 407




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['inchi_connectivity'] = df['papyrus_inchi_key'].apply(inchi_first_part)


In [12]:
df_no_duplicates, df_contra_duplicates = remove_duplicates(df_connectivity_inchi)
df_no_duplicates.to_csv('kpuu_no_duplicates_val.csv')
df_contra_duplicates.to_csv('kpuu_contradicting_duplicates.csv')

length: 407
unique_inchi: 311
Contradicting duplicates: 41
Original dataframe after contradicting duplicates removed: 297
Original dataframe unique_inchi after removal: 270
Original dataframe after non-contradicting duplicates removed: 270


Further curate contradicting


In [13]:
#Further curate contradicting


print(f'length: {len(df)}')

unique_counts = df_contra_duplicates.groupby('inchi_connectivity')['species'].nunique()
same_species = unique_counts[unique_counts >  1].index

print(f'Dupicates with same species: {len(same_species)}')


# Create a mask to identify the first entries of duplicates with the same species
df_unique = df_contra_duplicates.drop_duplicates(subset=['inchi_connectivity', 'species'], keep='first').reset_index(drop=True)

print(f'Length after inchi-key AND species duplicates are removed: {len(df_unique)}')



length: 436
Dupicates with same species: 39
Length after inchi-key AND species duplicates are removed: 101


In [14]:
df_unique.to_csv('kpuu_contradicting_duplicates_once.csv')

Remove training molecules

In [15]:
influx = pd.read_csv('../../../3_Combined/31_Combined_datasets/1_data_curation/influx/combined_influx_train_raw.csv')
efflux = pd.read_csv('../../../3_Combined/31_Combined_datasets/1_data_curation/efflux/combined_efflux_train_raw.csv')
pampa = pd.read_csv('../../../3_Combined/31_Combined_datasets/1_data_curation/pampa/combined_pampa_train_raw.csv')
bbb = pd.read_csv('../../../3_Combined/31_Combined_datasets/1_data_curation/bbb/combined_bbb_train_raw.csv')

In [16]:
mask = df_no_duplicates['inchi_connectivity'].isin(influx['inchi_connectivity'])
common_values_list = df_no_duplicates[mask]['inchi_connectivity'].tolist()
print(f'Common molecules {len(common_values_list)}')
kpuu_val = df_no_duplicates[~mask]
kpuu_val = kpuu_val.reset_index(drop=True)
print(f'After removed: {len(kpuu_val)}')

Common molecules 28
After removed: 242


In [17]:
kpuu_val.head()

Unnamed: 0,SMILES_raw,status_activity,species,papyrus_SMILES,papyrus_inchi_key,inchi_connectivity
0,CCN1N=C(C2=C1C(=O)N(C=C2)C1CCN(C)CC1)C1=CC(OC(...,active,mouse,CCn1nc(-c2cnc(N)c(OC(F)(F)F)c2)c2ccn(C3CCN(C)C...,NZCZMTGWHGOXJY-UHFFFAOYSA-N,NZCZMTGWHGOXJY
1,CCN1N=C(C2=C1C(=O)N(C=C2)C1CCN(CC1)C1COC1)C1=C...,?,mouse,CCn1nc(-c2cnc(N)c(OC(F)(F)F)c2)c2ccn(C3CCN(C4C...,ILMJTNRZTPMOFU-UHFFFAOYSA-N,ILMJTNRZTPMOFU
2,CCN1N=C(C2=C1C(=O)N(C=C2)C1CCOCC1)C1=CC(OC(F)(...,active,mouse,CCn1nc(-c2cnc(N)c(OC(F)(F)F)c2)c2ccn(C3CCOCC3)...,HSHPTVFAMKYPCX-UHFFFAOYSA-N,HSHPTVFAMKYPCX
3,CCN1N=C(C2=C1C(=O)N(C=C2)[C@@H]1CCOC[C@@H]1OC1...,?,mouse,CCn1nc(-c2cnc(N)c(OC(F)(F)F)c2)c2ccn([C@@H]3CC...,DLVDYHZYZUUGRK-WBVHZDCISA-N,DLVDYHZYZUUGRK
4,CCO[C@H]1COCC[C@H]1N1C=CC2=C(N(CC)N=C2C2=CC(OC...,?,mouse,CCO[C@H]1COCC[C@H]1n1ccc2c(-c3cnc(N)c(OC(F)(F)...,AOVQXUIMTPPLED-ZBFHGGJFSA-N,AOVQXUIMTPPLED


In [18]:
mask = kpuu_val['inchi_connectivity'].isin(efflux['inchi_connectivity'])
common_values_list = kpuu_val[mask]['inchi_connectivity'].tolist()
print(f'Common molecules {len(common_values_list)}')
kpuu_val = kpuu_val[~mask]
kpuu_val = kpuu_val.reset_index(drop=True)
print(f'After removed: {len(kpuu_val)}')

Common molecules 59
After removed: 183


In [19]:
mask = kpuu_val['inchi_connectivity'].isin(pampa['inchi_connectivity'])
common_values_list = kpuu_val[mask]['inchi_connectivity'].tolist()
print(f'Common molecules {len(common_values_list)}')
kpuu_val = kpuu_val[~mask]
kpuu_val = kpuu_val.reset_index(drop=True)
print(f'After removed: {len(kpuu_val)}')

Common molecules 4
After removed: 179


In [20]:
mask = kpuu_val['inchi_connectivity'].isin(bbb['inchi_connectivity'])
common_values_list = kpuu_val[mask]['inchi_connectivity'].tolist()
print(f'Common molecules {len(common_values_list)}')
kpuu_val = kpuu_val[~mask]
kpuuval = kpuu_val.reset_index(drop=True)
print(f'After removed: {len(kpuu_val)}')

Common molecules 42
After removed: 137


In [22]:
kpuu_val.to_csv('kpuu_validation_not_everything_classified.csv')

Keep only classified

In [23]:
kpuu_val_classified = kpuu_val[kpuu_val['status_activity'].isin(['active', 'inactive'])]
len(kpuu_val_classified)

114

In [24]:
kpuu_val_classified.to_csv('kpuu_validation.csv', index=True)