In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import openpyxl
import xlrd
from rdkit import Chem
from rdkit.Chem import inchi
import papyrus_scripts
from chembl_structure_pipeline import standardizer as ChEMBL_standardizer
from papyrus_structure_pipeline import standardizer as Papyrus_standardizer
from papyrus_structure_pipeline import standardize

In [None]:

from tqdm import tqdm
from time import sleep
from tqdm.notebook import tqdm

Query data from Papyrus

In [None]:
from papyrus_scripts import download_papyrus
# Download the Papyrus dataset (going by the function name) so the cells below can read it.
# NOTE(review): only_pp=False presumably requests the full dataset rather than only the
# Papyrus++ subset -- confirm against the papyrus_scripts documentation.
download_papyrus(only_pp=False)

In [None]:
from papyrus_scripts import read_papyrus
# Open the Papyrus data with a chunk size of 1,000,000.
# NOTE(review): with `chunksize` set, `data` is presumably a lazy iterator of DataFrame
# chunks rather than a single DataFrame (it is only materialised later via
# consume_chunks) -- confirm against the papyrus_scripts documentation.
data = read_papyrus(plusplus=False,chunksize=1_000_000)

In [None]:
# Accession codes handed to keep_accession() to select the proteins of interest.
# NOTE(review): presumably UniProt accessions of influx transporters (going by the
# variable name) -- verify each code against UniProt.
influx_ac_list = ['O76082','Q96FL8','P46721','O94956','Q9NYB5','Q6ZNC8','Q6ZWT7','Q95T53','Q01650']

In [None]:
from papyrus_scripts import read_protein_set
# Presumably loads the Papyrus protein/target table -- confirm against papyrus_scripts.
# NOTE(review): `protein_data` is not referenced by any other cell visible here;
# check whether this (potentially slow) load is still needed.
protein_data = read_protein_set()

In [None]:
from papyrus_scripts import keep_accession, keep_quality

# Two-stage filter over the Papyrus data:
#   1. keep_quality   -> called with the 'High' quality level
#   2. keep_accession -> restricted to the accession codes in influx_ac_list
# NOTE(review): the filters are presumably lazy when `data` is chunked -- confirm.
filter1 = keep_quality(data, 'High')
filter2 = keep_accession(filter1, influx_ac_list)

In [None]:
from papyrus_scripts import consume_chunks
# Materialise the filtered data into `influx`.
# NOTE(review): presumably this iterates over the filtered chunks and concatenates them
# into a single DataFrame (later cells index `influx` like a DataFrame) -- confirm
# against the papyrus_scripts documentation.
influx = consume_chunks(filter2)

In [None]:
# Display how many entries remain after the quality and accession filters.
len(influx)

In [None]:
# Load the training set; molecules that also occur in it are removed from the
# validation data further down.

train = pd.read_csv('train_data/kadar_influx_train.csv', sep=',', index_col=0)

In [None]:
# True for every Papyrus row whose connectivity also appears in the training set.
mask = influx['connectivity'].isin(train['inchi_connectivity'])

# Overlapping connectivities (one entry per matching row, so repeats are possible)
# and the remaining rows that form the validation candidates.
common_values_list = influx.loc[mask, 'connectivity'].tolist()
influx_val = influx[~mask]

print(f'Common: {len(common_values_list)}')
print(f'Final: {len(influx_val)}')

In [None]:
# Deduplicate per molecule: for every InChIKey keep only the row whose
# pchembl_value_Mean is the largest.

best_row_labels = influx_val.groupby('InChIKey')['pchembl_value_Mean'].idxmax()
val_influx = influx_val.loc[best_row_labels]

In [None]:
#Assign classes

def compare_values(row, threshold=5):
    """Classify one record by its mean pChEMBL value.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a 'pchembl_value_Mean' entry.
    threshold : float, default 5
        Cut-off: values at or below it are labelled 'Non-substrate',
        values above it 'Substrate'.

    Returns
    -------
    str
        'Non-substrate', 'Substrate', or '??' when the value compares as
        neither <= nor > the threshold (e.g. NaN).
    """
    value = row['pchembl_value_Mean']
    if value <= threshold:
        return 'Non-substrate'
    if value > threshold:
        return 'Substrate'
    # Both comparisons are False only for unordered values such as NaN.
    return '??'

# Label every row via compare_values and store the result in a new 'Class' column.
class_labels = val_influx.apply(compare_values, axis=1)
val_influx['Class'] = class_labels

In [None]:
# Reduce the frame to the columns needed downstream, encode the class labels
# numerically (Substrate -> 1, Non-substrate -> 0) and give the columns their
# final names.

class_codes = {'Substrate':1, 'Non-substrate':0}
final_column_names = {'Class': 'status_influx', 'connectivity': 'inchi_connectivity','SMILES':'papyrus_SMILES'}

val_influx = (
    val_influx[['SMILES','connectivity','InChIKey','pchembl_value_Mean','Class']]
    .assign(Class=lambda frame: frame['Class'].replace(class_codes))
    .rename(columns=final_column_names)
)

In [None]:
#Reset index
# Replace the row labels carried over from the filtered Papyrus frame with a fresh
# 0..n-1 index; drop=True discards the old labels instead of keeping them as a column.
val_influx = val_influx.reset_index(drop=True)

In [None]:
#Save the influx molecules
from pathlib import Path

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
output_path = Path('val_data/kadar_val_influx.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=True writes the row index as the first CSV column.
val_influx.to_csv(output_path, index=True)