# Data preparation

In [146]:
# import necessary packages
import pandas as pd
import numpy as np

## Load dbs

In [183]:
pubchem_db = r'C:\Users\pecho\Documents\serotonin_receptor_project\databases\original_dbs\pubchem_human_5ht6.csv'
chembl_db = r'C:\Users\pecho\Documents\serotonin_receptor_project\databases\original_dbs\chembl_human_5ht6.csv'

In [184]:
pubchem_df = pd.read_csv(pubchem_db, sep=',')
chembl_df = pd.read_csv(chembl_db, sep=';')

## Filter out unnecessary columns

In [185]:
pubchem_df.columns

Index(['baid', 'activity', 'aid', 'sid', 'cid', 'geneid', 'pmid', 'aidtype',
       'aidmdate', 'hasdrc', 'rnai', 'protacxn', 'acname', 'acqualifier',
       'acvalue', 'aidsrcname', 'aidname', 'cmpdname', 'targetname',
       'targeturl', 'ecs', 'repacxn', 'taxids', 'cellids', 'targettaxid'],
      dtype='object')

In [186]:
chembl_df.columns

Index(['Molecule ChEMBL ID', 'Molecule Name', 'Molecule Max Phase',
       'Molecular Weight', '#RO5 Violations', 'AlogP', 'Compound Key',
       'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value',
       'Standard Units', 'pChEMBL Value', 'Data Validity Comment', 'Comment',
       'Uo Units', 'Ligand Efficiency BEI', 'Ligand Efficiency LE',
       'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Potential Duplicate',
       'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID',
       'BAO Label', 'Assay Organism', 'Assay Tissue ChEMBL ID',
       'Assay Tissue Name', 'Assay Cell Type', 'Assay Subcellular Fraction',
       'Assay Parameters', 'Assay Variant Accession', 'Assay Variant Mutation',
       'Target ChEMBL ID', 'Target Name', 'Target Organism', 'Target Type',
       'Document ChEMBL ID', 'Source ID', 'Source Description',
       'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties'],
      dtype='object')

In [187]:
pubchem_columns = ['sid', 'acname', 'acqualifier', 'acvalue', 'activity']
chembl_columns = ['Smiles', 'Standard Type', 'Standard Relation', 'Standard Value', 'Standard Units', 'pChEMBL Value', 'Data Validity Comment']

In [188]:
pubchem_df = pubchem_df.drop(columns=[column for column in pubchem_df.columns if column not in pubchem_columns])
chembl_df = chembl_df.drop(columns=[column for column in chembl_df.columns if column not in chembl_columns])

## Add smiles to pubchem dataset

Smiles were obtained through Pubchem Identifier Exchange Service based on SID

In [189]:
pubchem_smiles = r'C:\Users\pecho\Documents\serotonin_receptor_project\databases\original_dbs\pubchem_db_smiles.csv'
pubchem_smiles_df = pd.read_csv(pubchem_smiles, sep="\t")

In [190]:
pubchem_df = pd.merge(pubchem_df, pubchem_smiles_df, how='left')

In [191]:
pubchem_df.drop_duplicates(subset=['sid'], inplace=True)

In [192]:
# renaming columns so that they much ChEMBL dataset column names
pubchem_df = pubchem_df.rename(columns= {'acname': 'Standard Type', 'acqualifier': 'Standard Relation', 'acvalue': 'Standard Value'})

## Clean pubchem dataset

### Select and clean data for classification tasks

This dataset also contains molecules that do not have activity (Ki) measured. They inhibition concentrations were too large and they were deemed inactive.
This is not a problem for classification as inhibition constant should not be one of the features to allow for prediction based solely on structure and
physico-chemical properties (that are easily calculable).

In [193]:
pubchem_cl_df = pubchem_df.copy()

In [194]:
# Select rows with only clearly annotated activity state
pubchem_cl_df.query('activity == "Active" or activity == "Inactive"', inplace=True)

In [195]:
pubchem_cl_df

Unnamed: 0,activity,sid,Standard Type,Standard Relation,Standard Value,Smiles
0,Active,103430412,IC50,=,0.000700,CN(C)CCCC1=C(NC2=CC=CC=C21)C3=CC(=CC=C3)Br
2,Active,103192227,Ki,=,0.001000,C1CCN(C1)C2=NC(=CC(=C2)S(=O)(=O)C3=CC=C(C=C3)N)Br
8,Active,103192262,Ki,=,0.002291,CNC1=NC(=CC(=C1)S(=O)(=O)C2=CC=C(C=C2)N)Br
9,Active,103192311,Ki,=,0.000115,C1CN(CCN1)C2=NC(=CC(=C2)S(=O)(=O)C3=CC=C(C=C3)...
11,Active,103192312,Ki,=,0.053700,C1=CC(=CC=C1N)S(=O)(=O)C2=CC(=NC(=C2)Br)Br
...,...,...,...,...,...,...
26676,Inactive,461558376,,,,CC1=CC=C(C=C1)S(=O)(=O)NCCN2CCC(CC2)CCC(=O)C3=...
26682,Inactive,461549557,,,,COC1=CC(=C(C=C1)S(=O)(=O)NCCN2CCC(CC2)CCC(=O)C...
26684,Inactive,461563589,,,,CC(=O)N1CCC(CC1)[C@](C2=CC=CC=C2)(C3=CC4=C(C=C...
26692,Inactive,461517760,,,,COC1=CC=C(C=C1)S(=O)(=O)N2CCC(CC2)CCC(=O)C3=CC...


In [196]:
# Only Ki and nan will be selected
set(pubchem_cl_df['Standard Type'].values)

{'Activity', 'EC50', 'IC50', 'Inhibition', 'Kb', 'Kbapp', 'Kd', 'Ki', nan}

In [197]:
# First line selects only active or inactive ligands
# Second line selects rows with specific activity value or no value for inactive ligands
pubchem_cl_df.query('`Standard Type` == "Ki" or `Standard Type` != `Standard Type`', inplace=True)
pubchem_cl_df.query('`Standard Relation` == "=" or `Standard Relation` != `Standard Relation`', inplace=True)

In [198]:
pubchem_cl_df

Unnamed: 0,activity,sid,Standard Type,Standard Relation,Standard Value,Smiles
2,Active,103192227,Ki,=,0.001000,C1CCN(C1)C2=NC(=CC(=C2)S(=O)(=O)C3=CC=C(C=C3)N)Br
8,Active,103192262,Ki,=,0.002291,CNC1=NC(=CC(=C1)S(=O)(=O)C2=CC=C(C=C2)N)Br
9,Active,103192311,Ki,=,0.000115,C1CN(CCN1)C2=NC(=CC(=C2)S(=O)(=O)C3=CC=C(C=C3)...
11,Active,103192312,Ki,=,0.053700,C1=CC(=CC=C1N)S(=O)(=O)C2=CC(=NC(=C2)Br)Br
13,Active,103670002,Ki,=,0.054950,CNC1=CC(=NC(=N1)NC)NS(=O)(=O)C2=CC=C(C=C2)N.Br.Br
...,...,...,...,...,...,...
26676,Inactive,461558376,,,,CC1=CC=C(C=C1)S(=O)(=O)NCCN2CCC(CC2)CCC(=O)C3=...
26682,Inactive,461549557,,,,COC1=CC(=C(C=C1)S(=O)(=O)NCCN2CCC(CC2)CCC(=O)C...
26684,Inactive,461563589,,,,CC(=O)N1CCC(CC1)[C@](C2=CC=CC=C2)(C3=CC4=C(C=C...
26692,Inactive,461517760,,,,COC1=CC=C(C=C1)S(=O)(=O)N2CCC(CC2)CCC(=O)C3=CC...


In [199]:
# Create new column to assign activity
conditions = [pubchem_cl_df['Standard Value'] <= 10, pubchem_cl_df['Standard Value'] > 10, pubchem_cl_df['Standard Value'].isnull()]
values = [1, 0, 0]
pubchem_cl_df['Standard Activity'] = np.select(conditions, values)

In [200]:
pubchem_cl_df.reset_index(inplace=True, drop=True)
pubchem_cl_df = pubchem_cl_df.drop(columns=['activity', 'sid', 'Standard Type', 'Standard Relation'])
pubchem_cl_df['Standard Value'] = pubchem_cl_df['Standard Value'].multiply(1000)
pubchem_cl_df = pubchem_cl_df.round(4)

In [201]:
pubchem_cl_df

Unnamed: 0,Standard Value,Smiles,Standard Activity
0,1.0000,C1CCN(C1)C2=NC(=CC(=C2)S(=O)(=O)C3=CC=C(C=C3)N)Br,1
1,2.2910,CNC1=NC(=CC(=C1)S(=O)(=O)C2=CC=C(C=C2)N)Br,1
2,0.1148,C1CN(CCN1)C2=NC(=CC(=C2)S(=O)(=O)C3=CC=C(C=C3)...,1
3,53.7000,C1=CC(=CC=C1N)S(=O)(=O)C2=CC(=NC(=C2)Br)Br,1
4,54.9500,CNC1=CC(=NC(=N1)NC)NS(=O)(=O)C2=CC=C(C=C2)N.Br.Br,1
...,...,...,...
3206,,CC1=CC=C(C=C1)S(=O)(=O)NCCN2CCC(CC2)CCC(=O)C3=...,0
3207,,COC1=CC(=C(C=C1)S(=O)(=O)NCCN2CCC(CC2)CCC(=O)C...,0
3208,,CC(=O)N1CCC(CC1)[C@](C2=CC=CC=C2)(C3=CC4=C(C=C...,0
3209,,COC1=CC=C(C=C1)S(=O)(=O)N2CCC(CC2)CCC(=O)C3=CC...,0


In [202]:
# Save pubchem classification dataset
pubchem_cl_df.to_csv(r'C:\Users\pecho\Documents\serotonin_receptor_project\databases\clean_dbs\pubchem_cl_human_5ht6.csv', sep='\t')

### Select and clean data for regression tasks

In [203]:
pubchem_df = pubchem_df.drop(pubchem_df[pubchem_df['Standard Type'] != 'Ki'].index)
pubchem_df = pubchem_df.drop(pubchem_df[pubchem_df['Standard Relation'] != '='].index)
pubchem_df = pubchem_df.drop(pubchem_df[pubchem_df['activity'] != 'Active'].index)
pubchem_df = pubchem_df.drop(columns=['activity', 'Standard Relation', 'sid'])
pubchem_df.dropna(inplace=True)

In [204]:
# Convert mikro mol Ki to nmol
pubchem_df['Standard Value'] = pubchem_df['Standard Value'].multiply(1000)

In [205]:
# Create new pKi column by calculating -log(Ki)
pubchem_df['pKi'] = -np.log10(pubchem_df['Standard Value'] / (10 ** 9))
pubchem_df = pubchem_df.round(4)

In [206]:
pubchem_df.to_csv(r'C:\Users\pecho\Documents\serotonin_receptor_project\databases\clean_dbs\pubchem_reg_human_5ht6.csv', sep='\t')

## Clean chembl dataset

In [252]:
chembl_df = chembl_df.drop(chembl_df[chembl_df['Standard Type'] != 'Ki'].index)
chembl_df = chembl_df.drop(chembl_df[chembl_df['Standard Relation'] != "'='"].index)
chembl_df.drop_duplicates(subset=['Smiles'], inplace=True)
chembl_df.dropna(inplace=True)

In [253]:
chembl_df = chembl_df.drop(columns=['Standard Relation', 'Standard Units'])

In [254]:
chembl_df

Unnamed: 0,Smiles,Standard Type,Standard Value,pChEMBL Value
0,CC(C)c1ccc(S(=O)(=O)n2cc(N3CCN(C)CC3)c3ccccc32...,Ki,3.400,8.47
1,COc1ccc(Cl)cc1NS(=O)(=O)c1ccc(OC)c(N2CCNCC2)c1,Ki,1.259,8.90
2,CNc1cc(S(=O)(=O)Nc2ccc(N)cc2)cc(Br)n1,Ki,42.660,7.37
4,CC([Se]c1ccccc1)c1nc(N)nc(N2CCN(C)CC2)n1,Ki,111.000,6.96
6,CN1CCN(c2nc(N)nc(CSc3ccccc3)n2)CC1,Ki,26.000,7.58
...,...,...,...,...
5193,COc1ccc(Cc2nc(N3CCNCC3)nc3cccnc23)c(OC)c1,Ki,58.000,7.24
5201,O=C1Cc2cc(CCN3CCN(c4nsc5ccccc45)CC3)c(Cl)cc2N1,Ki,20.000,7.70
5202,O=S(=O)(c1cccc2ncccc12)N1CCCCC[C@H]1CCN1CCN(c2...,Ki,176.000,6.75
5203,O=S(=O)(c1cncc2ccccc12)N1CC[C@@H]1CCN1CCN(c2cc...,Ki,104.000,6.98


In [266]:
chembl_df.to_csv(r'C:\Users\pecho\Documents\serotonin_receptor_project\databases\clean_dbs\chembl_human_5ht6.csv', sep='\t', index=False)

## Concatenate the two dataframes

In [257]:
dataframes = [pubchem_df, chembl_df]
concat_df = pd.concat(dataframes)

In [258]:
concat_df

Unnamed: 0,Standard Type,Standard Value,Smiles,pChEMBL Value
2,Ki,1.0000,C1CCN(C1)C2=NC(=CC(=C2)S(=O)(=O)C3=CC=C(C=C3)N)Br,
8,Ki,2.2910,CNC1=NC(=CC(=C1)S(=O)(=O)C2=CC=C(C=C2)N)Br,
9,Ki,0.1148,C1CN(CCN1)C2=NC(=CC(=C2)S(=O)(=O)C3=CC=C(C=C3)...,
11,Ki,53.7000,C1=CC(=CC=C1N)S(=O)(=O)C2=CC(=NC(=C2)Br)Br,
13,Ki,54.9500,CNC1=CC(=NC(=N1)NC)NS(=O)(=O)C2=CC=C(C=C2)N.Br.Br,
...,...,...,...,...
5193,Ki,58.0000,COc1ccc(Cc2nc(N3CCNCC3)nc3cccnc23)c(OC)c1,7.24
5201,Ki,20.0000,O=C1Cc2cc(CCN3CCN(c4nsc5ccccc45)CC3)c(Cl)cc2N1,7.70
5202,Ki,176.0000,O=S(=O)(c1cccc2ncccc12)N1CCCCC[C@H]1CCN1CCN(c2...,6.75
5203,Ki,104.0000,O=S(=O)(c1cncc2ccccc12)N1CC[C@@H]1CCN1CCN(c2cc...,6.98


In [267]:
concat_df.to_csv(r'C:\Users\pecho\Documents\serotonin_receptor_project\databases\clean_dbs\concat_db_human_5ht6.csv', sep='\t', index=False)