In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import openpyxl
import xlrd
from rdkit import Chem
from rdkit.Chem import inchi
import papyrus_scripts
from chembl_structure_pipeline import standardizer as ChEMBL_standardizer
from papyrus_structure_pipeline import standardizer as Papyrus_standardizer
from papyrus_structure_pipeline import standardize

In [None]:

from tqdm import tqdm
from time import sleep
from tqdm.notebook import tqdm

Query data from Papyrus

In [None]:

from papyrus_scripts import download_papyrus
# Fetch the Papyrus data files so that later cells can read them.
# NOTE(review): only_pp=False presumably requests the full dataset rather than
# only the Papyrus++ subset; the dataset version is left at the library default
# - confirm against the papyrus_scripts documentation.
download_papyrus(only_pp=False)

In [None]:
from papyrus_scripts import read_papyrus
# Open the Papyrus bioactivity data (plusplus=False, matching the download above).
# NOTE(review): with a chunksize this presumably yields chunks of up to
# 1,000,000 rows lazily instead of one DataFrame - confirm; the filters and
# consume_chunks call in later cells are written on that assumption.
data = read_papyrus(plusplus=False,chunksize=1_000_000)

In [None]:
# UniProt accessions of the efflux transporters whose data should be kept.
# UniProt accessions always start with a letter: the last four begin with the
# capital letter 'O', not the digit zero (which would never match any target).
efflux_ac_list = [
    'P08183',
    'Q9UNQ0',
    'P33527',
    'Q92887',
    'O15438',
    'O15439',
    'O15440',
    'O95255',
]

In [None]:
from papyrus_scripts import read_protein_set
# Load the Papyrus protein/target table with the library's default arguments.
# NOTE(review): protein_data is not used by any of the cells visible here -
# confirm whether it is still needed.
protein_data = read_protein_set()

In [None]:
from papyrus_scripts import keep_quality
from papyrus_scripts import keep_accession

# Step 1: restrict the Papyrus data to entries passing the 'High' quality setting.
filter1=keep_quality(data,'High')
# Step 2: of those, keep only entries for the accessions in efflux_ac_list.
# NOTE(review): presumably both filters are lazy and only run when the chunks
# are consumed in a later cell - confirm against papyrus_scripts.
filter2=keep_accession(filter1, efflux_ac_list)

In [None]:
from papyrus_scripts import consume_chunks
# Run the filter pipeline and collect its output into `efflux`, which the
# following cells use as a single pandas DataFrame.
# NOTE(review): presumably this iterates over all chunks of the full dataset,
# so it may be slow - confirm.
efflux = consume_chunks(filter2)

In [None]:
# Report the number of rows retained after filtering, then preview the first rows
print(efflux.shape[0])
efflux.head()

# Check for molecules that are present in both the training and validation datasets, and remove them from the validation set

In [None]:
# Load the training set and flag every candidate row whose connectivity value
# also occurs in the training data, so no molecule is shared between both sets.
train = pd.read_csv('train_data/kadar_efflux_train.csv', sep=',', index_col=0)
mask = efflux['connectivity'].isin(train['inchi_connectivity'])

# Connectivity values found in both sets (one entry per matching row)
common_values_list = efflux.loc[mask, 'connectivity'].tolist()
print(f'Common: {len(common_values_list)}')

# Rows not seen in the training data form the validation pool
efflux_val = efflux[~mask]
print(f'Final validation: {len(efflux_val)}')

In [None]:
# For every InChIKey keep only the row with the highest mean pchembl value:
# first find the index label of the best row per group, then select those rows.
best_row_labels = efflux_val.groupby('InChIKey')['pchembl_value_Mean'].idxmax()
val_efflux = efflux_val.loc[best_row_labels]

In [None]:
#Classify

def compare_values(row):
    """Classify one row by its mean pchembl value.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key 'pchembl_value_Mean' (e.g. a DataFrame row).

    Returns
    -------
    str
        'Substrate' when the value is greater than 5, 'Non-substrate' when it
        is less than or equal to 5, and '??' when neither comparison holds
        (e.g. the value is NaN).
    """
    pchembl = row['pchembl_value_Mean']
    if pchembl > 5:
        return 'Substrate'
    if pchembl <= 5:
        return 'Non-substrate'
    # Both comparisons are False for NaN, so it ends up here
    return '??'

# Label every row: compare_values receives one row at a time
val_efflux['status_efflux'] = val_efflux.apply(compare_values, axis='columns')


In [None]:
# Restrict the table to the columns needed downstream and give two of them
# their final names (connectivity -> inchi_connectivity, SMILES -> papyrus_SMILES)
val_efflux = val_efflux[
    ['SMILES', 'connectivity', 'InChIKey', 'pchembl_value_Mean', 'status_efflux']
].rename(columns={'connectivity': 'inchi_connectivity', 'SMILES': 'papyrus_SMILES'})

In [None]:
# Encode the class labels as integers: Substrate -> 1, Non-substrate -> 0.
# Any other label is not in the mapping and is left unchanged by replace().
status_codes = {'Substrate': 1, 'Non-substrate': 0}
val_efflux['status_efflux'] = val_efflux['status_efflux'].replace(status_codes)


In [None]:
# Reset the index to a fresh 0..n-1 range; drop=True discards the old row
# labels instead of keeping them as a column.
val_efflux = val_efflux.reset_index(drop=True)

In [None]:
# Save the efflux validation molecules; index=True writes the row index as the
# first column of the CSV.
import os

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('val_data', exist_ok=True)
val_efflux.to_csv('val_data/kadar_efflux_val.csv', index=True)