In [None]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import openpyxl
import xlrd
from rdkit import Chem
from rdkit.Chem import inchi
from chembl_structure_pipeline import standardizer as ChEMBL_standardizer
from papyrus_structure_pipeline import standardizer as Papyrus_standardizer
from papyrus_structure_pipeline import standardize

from papyrus_scripts import *

In [None]:
#Fetch the Papyrus database files used by the reader calls further down
#NOTE(review): only_pp=False presumably downloads more than the ++ subset that is read below - confirm it is needed
download_papyrus(
    only_pp=False,
)

In [None]:
#Accession codes used to select the Bcl-2 records
bcl_ac = [
    'P10415',
    'A0A1L4AQQ4',
    'A0A1L4AQQ5',
    'A0A1L4AQQ8',
    'A0A1L4AQR0',
    'A0A1L4AQR6',
    'A0A7I2V3S7',
    'A0A7I2V5Q7',
]

#Accession codes used to select the Mcl-1 records
mcl_ac = [
    'Q07820',
    'C8YZ26',
    'A0A7I2V2W2',
    'A0A089WT64',
]

Bcl-2 data

In [None]:
#Open the Papyrus++ data in chunks of 1,000,000 rows
data = read_papyrus(
    plusplus=True,
    chunksize=1_000_000,
)

In [None]:
#Read the protein data
#NOTE(review): protein_data is not referenced by any cell visible in this part of the notebook
protein_data = read_protein_set()

In [None]:
#Restrict the chunked data to entries of quality 'High'
filter1 = keep_quality(
    data,
    'High',
)

In [None]:
#Narrow the high-quality data down to the accessions listed in bcl_ac
filter_bcl = keep_accession(
    filter1,
    bcl_ac,
)


In [None]:
#Create the Bcl-2 dataset from the filtered chunks
#(the result is used below like a DataFrame: to_csv and column selection)
bcl = consume_chunks(filter_bcl)

In [None]:
#Save the raw query result
#The output folder is created first: DataFrame.to_csv does not create missing
#parent directories, so a fresh checkout would otherwise fail at this cell.
bcl_raw_path = Path('bcl/bcl2_raw.csv')
bcl_raw_path.parent.mkdir(parents=True, exist_ok=True)
bcl.to_csv(bcl_raw_path, index=True)

Mcl-1 data

In [None]:
#Open the Papyrus++ data again for the Mcl-1 query (chunks of 1,000,000 rows)
#NOTE(review): presumably needed because the reader opened for Bcl-2 has already been consumed - confirm
data = read_papyrus(
    plusplus=True,
    chunksize=1_000_000,
)

In [None]:
#Read the protein data
#NOTE(review): same call as in the Bcl-2 section and the result is not used in the visible cells - looks redundant, confirm
protein_data = read_protein_set()

In [None]:
#Restrict the freshly opened data to entries of quality 'High'
filter1 = keep_quality(
    data,
    'High',
)

In [None]:
#Narrow the high-quality data down to the accessions listed in mcl_ac
filter_mcl = keep_accession(
    filter1,
    mcl_ac,
)

In [None]:
#Create the Mcl-1 dataset from the filtered chunks
#(the result is used below like a DataFrame: to_csv and column selection)
mcl = consume_chunks(filter_mcl)

In [None]:
#Save the raw query result
#The output folder is created first: DataFrame.to_csv does not create missing
#parent directories, so a fresh checkout would otherwise fail at this cell.
mcl_raw_path = Path('mcl/mcl1_raw.csv')
mcl_raw_path.parent.mkdir(parents=True, exist_ok=True)
mcl.to_csv(mcl_raw_path, index=True)

Clean

In [None]:
#Keep only the relevant columns
#.copy() gives each subset its own data, so adding a column to it later does not
#trigger pandas' SettingWithCopyWarning and cannot affect the raw query frames.
relevant_columns = ['SMILES','InChIKey','connectivity','pchembl_value_Mean']
df_bcl = bcl[relevant_columns].copy()
df_mcl = mcl[relevant_columns].copy()

In [None]:
#Classify compounds

def classification(row, threshold=6.5):
    """Label one record as 'Inhibitor' or 'Non-inhibitor' from its mean pChEMBL value.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'pchembl_value_Mean' (e.g. a DataFrame row).
    threshold : float, optional
        Cut-off applied to 'pchembl_value_Mean'; values at or above it are
        labelled 'Inhibitor'. Defaults to 6.5.

    Returns
    -------
    str
        'Inhibitor', 'Non-inhibitor', or '??' when the value compares neither
        below nor at/above the threshold (e.g. NaN).
    """
    value = row['pchembl_value_Mean']
    if value >= threshold:
        return 'Inhibitor'
    if value < threshold:
        return 'Non-inhibitor'
    #Both comparisons are False for NaN, so such values cannot be classified
    return '??'

#Label every row. assign() builds a new frame instead of writing a column into a
#frame derived from the raw query result, which avoids pandas' SettingWithCopyWarning.
df_bcl = df_bcl.assign(Class=df_bcl.apply(classification, axis=1))
df_mcl = df_mcl.assign(Class=df_mcl.apply(classification, axis=1))


In [None]:
#Check for duplicates

df = df_bcl #Change the dataset here, to check for duplicates

#Number of rows vs. number of distinct 'connectivity' keys; a gap means repeated keys
print(len(df))
print(df['connectivity'].nunique())
print(df['connectivity'].value_counts())

#Check for contradicting duplicates: keys whose rows carry more than one 'Class' label
unique_counts_log = df.groupby('connectivity')['Class'].nunique()
duplicates_diff_class_log = unique_counts_log[unique_counts_log > 1].index

print(duplicates_diff_class_log)
print(f'Contradicting duplicates: {len(duplicates_diff_class_log)}')

df = df.reset_index(drop=True)

#Save the final dataset: Change the filename here!
#NOTE(review): no rows are dropped in this cell, so any duplicates reported above
#are still present in the saved file - confirm whether they should be removed first.
#The output folder is created first because DataFrame.to_csv does not create
#missing parent directories.
molecules_path = Path('bcl/bcl_molecules.csv')
molecules_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(molecules_path, index=True)