<a href="https://colab.research.google.com/github/jyryu3161/lec_bioai/blob/main/featurization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only step: prompt the user to upload the input CSV into the runtime.
# In the recorded run the uploaded file was saved as input.csv.
from google.colab import files
# Keep the return value of the upload call so it stays available to later cells.
file_uploaded = files.upload()

Saving input.csv to input.csv


# 입력 파일 설정

In [2]:

# Name of the uploaded CSV; the featurization cell opens this path and reuses
# its base name when naming the output files.
input_file = 'input.csv' # enter the input file name only


## 환경 구성

In [3]:
# download & extract
url = 'https://anaconda.org/rdkit/rdkit/2018.09.1.0/download/linux-64/rdkit-2018.09.1.0-py36h71b666b_1.tar.bz2'
!curl -L $url | tar xj lib
# move to python packages directory
!mv lib/python3.6/site-packages/rdkit /usr/local/lib/python3.6/dist-packages/
x86 = '/usr/lib/x86_64-linux-gnu'
!mv lib/*.so.* $x86/
# rdkit need libboost_python3.so.1.65.1
!ln -s $x86/libboost_python3-py36.so.1.65.1 $x86/libboost_python3.so.1.65.1

!pip install --pre deepchem

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4051    0  4051    0     0   8474      0 --:--:-- --:--:-- --:--:--  8457
100 20.2M  100 20.2M    0     0  3634k      0  0:00:05  0:00:05 --:--:-- 4455k
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepchem
  Downloading deepchem-2.6.2.dev20221004063301-py3-none-any.whl (691 kB)
[K     |████████████████████████████████| 691 kB 7.4 MB/s 
Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.8 MB)
[K     |████████████████████████████████| 36.8 MB 1.2 MB/s 
Installing collected packages: rdkit-pypi, deepchem
Successfully installed deepchem-2.6.2.dev20221004063301 rdkit-pypi-2022.3.5


# 프로그램 실행

In [4]:
import pandas as pd
import numpy as np
import deepchem as dc
import os

############### shared settings and helpers ###############
FINGERPRINT_SIZE = 1024   # length of each circular fingerprint vector
FINGERPRINT_RADIUS = 2    # radius argument passed to CircularFingerprint


def read_smiles_and_labels(csv_path):
    """Read (smiles, label) string pairs from a comma-separated text file.

    The first line is treated as a header and skipped. Each remaining line is
    split on commas; the first field is the SMILES string and the second is
    the label. Both are stripped of surrounding whitespace and any further
    fields are ignored. Blank lines are skipped.

    Args:
        csv_path: path of the file to read.

    Returns:
        A list of (smiles, label) tuples in file order.

    Raises:
        ValueError: if a non-blank line has fewer than two fields.
    """
    records = []
    with open(csv_path, 'r') as fp:
        fp.readline()  # skip the header row
        # Data rows start on line 2 of the file because of the header.
        for line_number, line in enumerate(fp, start=2):
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. trailing newline at the end of the file)
                # used to raise IndexError when the label field was read.
                continue
            fields = stripped.split(',')
            if len(fields) < 2:
                raise ValueError(
                    '%s line %d: expected "smiles,label" but got %r'
                    % (csv_path, line_number, stripped))
            records.append((fields[0].strip(), fields[1].strip()))
    return records


def build_feature_table(records, featurizer, get_column_names):
    """Featurize each SMILES and collect the usable rows into a DataFrame.

    Args:
        records: iterable of (smiles, label) pairs.
        featurizer: object with a ``featurize(smiles)`` method; the first
            element of its result is used as the feature vector.
        get_column_names: zero-argument callable returning the feature column
            names. It is called after featurizing rather than up front,
            mirroring the original code, which read the descriptor names from
            the featurizer only after calling ``featurize``.

    Returns:
        A DataFrame with one column per feature followed by 'label' and
        'smiles'. Compounds whose feature vector length differs from the
        number of column names (i.e. the features could not be computed) are
        left out. If no compound is usable the frame has zero rows.
    """
    vectors = []
    labels = []
    kept_smiles = []

    for smiles, label in records:
        # Features for a single SMILES string.
        vector = featurizer.featurize(smiles)[0]

        # Skip compounds whose features could not be computed.
        if vector.shape[0] != len(get_column_names()):
            continue

        vectors.append(vector)
        labels.append(label)
        kept_smiles.append(smiles)

    column_names = list(get_column_names())
    # The explicit reshape is a no-op when rows exist; with zero rows it turns
    # the 1-D empty array into a (0, n_columns) matrix so that DataFrame
    # construction does not fail on a shape mismatch.
    matrix = np.asarray(vectors).reshape(len(vectors), len(column_names))

    table = pd.DataFrame(matrix, columns=column_names)
    table['label'] = labels
    table['smiles'] = kept_smiles
    return table


# Parse the input once and reuse the records for both featurizations.
records = read_smiles_and_labels(input_file)


############### molecular descriptor calculation ###############
descriptor_featurizer = dc.feat.RDKitDescriptors()
df_descriptor = build_feature_table(
    records,
    descriptor_featurizer,
    lambda: descriptor_featurizer.descriptors,
)

output_file_descriptor = 'output_descriptor_%s'%(os.path.basename(input_file))
df_descriptor.to_csv(output_file_descriptor, index=False)


############### molecular fingerprint calculation ###############
fingerprint_featurizer = dc.feat.CircularFingerprint(
    size=FINGERPRINT_SIZE, radius=FINGERPRINT_RADIUS)
# Column names x1 .. x1024, one per fingerprint position.
fingerprint_columns = ['x%s'%(i+1) for i in range(FINGERPRINT_SIZE)]
df_fingerprint = build_feature_table(
    records,
    fingerprint_featurizer,
    lambda: fingerprint_columns,
)

output_file_fingerprint = 'output_fingerprint_%s'%(os.path.basename(input_file))
df_fingerprint.to_csv(output_file_fingerprint, index=False)


