# Installing Libraries

In [20]:
!apt-get install -y default-jre

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  ca-certificates-java default-jre-headless fonts-dejavu-core
  fonts-dejavu-extra java-common libatk-wrapper-java libatk-wrapper-java-jni
  libpcsclite1 libxtst6 libxxf86dga1 openjdk-11-jre openjdk-11-jre-headless
  x11-utils
Suggested packages:
  pcscd libnss-mdns fonts-ipafont-gothic fonts-ipafont-mincho
  fonts-wqy-microhei | fonts-wqy-zenhei fonts-indic mesa-utils
The following NEW packages will be installed:
  ca-certificates-java default-jre default-jre-headless fonts-dejavu-core
  fonts-dejavu-extra java-common libatk-wrapper-java libatk-wrapper-java-jni
  libpcsclite1 libxtst6 libxxf86dga1 openjdk-11-jre openjdk-11-jre-headless
  x11-utils
0 upgraded, 14 newly installed, 0 to remove and 41 not upgraded.
Need to get 46.2 MB of archives.
After this operation, 189 MB of additional disk space will be used.
Get:1 http://archive.ubuntu

# Installing padelpy (Python wrapper for PaDEL-Descriptor)

In [3]:
!pip install padelpy
from padelpy import padeldescriptor
import pandas as pd

Collecting padelpy
  Downloading padelpy-0.1.16-py3-none-any.whl.metadata (7.7 kB)
Downloading padelpy-0.1.16-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: padelpy
Successfully installed padelpy-0.1.16


# Reading data

In [4]:
import pandas as pd


In [7]:
# Load the 2-class acetylcholinesterase bioactivity table (includes pIC50).
df = pd.read_csv(
    filepath_or_buffer="/content/acetylcholinesterase_05_bioactivity_data_2class_pIC50.csv",
)

In [8]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,active,312.325,2.8032,0.0,6.0,6.124939
1,1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,active,376.913,4.5546,0.0,5.0,7.0
2,2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,inactive,426.851,5.3574,0.0,5.0,4.30103
3,3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,active,404.845,4.7069,0.0,5.0,6.522879
4,4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,active,346.334,3.0953,0.0,6.0,6.09691


# Selecting Columns

In [10]:
# Keep only the structure (SMILES) and its ChEMBL identifier, in that order.
selection = ['canonical_smiles', 'molecule_chembl_id']
df_selection = df.loc[:, selection]

# Write a headerless, tab-separated file: one "<SMILES><TAB><ChEMBL ID>" row
# per molecule.
df_selection.to_csv(
    path_or_buf='molecule.smi',
    sep='\t',
    header=False,
    index=False,
)

In [11]:
! cat molecule.smi | head -5

CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1	CHEMBL133897
O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1	CHEMBL336398
CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1	CHEMBL131588
O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F	CHEMBL130628
CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C	CHEMBL130478


In [12]:
! cat molecule.smi | wc -l

5135


# Calculate Fingerprint descriptors

## Calculate PaDEL descriptors

In [13]:
# List the working directory to confirm molecule.smi was written next to the
# input CSV files.
!ls

acetylcholinesterase_03_bioactivity_data_curated.csv	   molecule.smi
acetylcholinesterase_05_bioactivity_data_2class_pIC50.csv  sample_data


In [17]:
!mkdir mols

In [18]:
!mv molecule.smi mols/

In [21]:
from padelpy import padeldescriptor

# Run PaDEL-Descriptor over every structure file in mols/ and write the
# resulting fingerprint table to descriptors.csv.
padeldescriptor(
    # Input directory and output CSV.
    mol_dir='mols',
    d_file='descriptors.csv',
    # Structure clean-up applied before the calculation.
    removesalt=True,
    standardizenitro=True,
    # Compute fingerprints and keep the output rows in input-file order, so
    # they can later be paired with the bioactivity rows by position.
    fingerprints=True,
    retainorder=True,
)


# Preparing X and Y matrices

In [24]:
# Read the descriptor table produced by PaDEL.
df_X = pd.read_csv(filepath_or_buffer='descriptors.csv')

In [25]:
# Drop the molecule-name column so only the fingerprint bits remain as features.
# errors='ignore' keeps the cell idempotent: re-running it after 'Name' is
# already gone would otherwise raise a KeyError.
df_X = df_X.drop(columns=['Name'], errors='ignore')


In [27]:
# Target variable: the pIC50 column of the bioactivity table.
df_Y = df.loc[:, 'pIC50']
df_Y

Unnamed: 0,pIC50
0,6.124939
1,7.000000
2,4.301030
3,6.522879
4,6.096910
...,...
5130,6.517126
5131,6.386158
5132,6.403403
5133,6.204120


In [28]:
# Features and target are paired purely by row position. pd.concat(axis=1)
# aligns on the index and would silently pad with NaN if the two row counts
# differed, so fail loudly instead.
assert len(df_X) == len(df_Y), (
    f"descriptor rows ({len(df_X)}) != bioactivity rows ({len(df_Y)}); "
    "X and Y cannot be paired by position"
)

# Fingerprint columns first, pIC50 as the last column.
dataset = pd.concat([df_X, df_Y], axis=1)
dataset

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.124939
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.000000
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.301030
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.522879
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.096910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5130,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.517126
5131,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.386158
5132,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.403403
5133,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.204120


In [30]:
# Persist the combined fingerprint + pIC50 table without the row index.
# NOTE(review): the output name says "3class" but the input loaded above is the
# "2class" file; confirm which label is intended before renaming, since
# downstream notebooks may read this exact filename.
dataset.to_csv(
    path_or_buf='acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv',
    index=False,
)