<a href="https://colab.research.google.com/github/jyryu3161/lec_bioai/blob/main/featurization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 입력 파일 설정

In [None]:

# Input file consumed by the featurization cells below: they open it as text,
# skip the first (header) line, and split each remaining line on commas,
# taking column 0 as the SMILES string and column 1 as the label.
input_file = '1.herg_data.csv' # Enter only the input file name


## 환경 구성

In [None]:
# Install the third-party packages imported by the featurization cell
# (deepchem and rdkit). deepchem is pinned to 2.5.0; rdkit-pypi is not pinned.
# NOTE(review): with no version pin, the RDKit release that gets installed is
# whatever pip resolves at run time -- consider pinning it for reproducibility.
!pip install  deepchem==2.5.0
!pip install rdkit-pypi

Collecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting networkx==2.* (from mordred)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: mordred
  Building wheel for mordred (setup.py) ... [?25l[?25hdone
  Created wheel for mordred: filename=mordred-1.2.0-py3-none-any.whl size=176721 sha256=963bf1ad5ad9bc9f34042a4056fba9ae8e492fb2bb317043c665ebc593ab7a0b
  Stored in directory: /root/.cache/pip/wheels/a7/4f/b8/d4c6591f6ac944aaced7865b349477695f662388ad958743c7
Successfully built mordred
Installing collected packages: networkx, mordred
  Attempting uninstall: networkx
    Found existing installation: networkx 3.1
    Uninstall

# 프로그램 실행

In [None]:
import pandas as pd
import numpy as np
import deepchem as dc
import os
from rdkit import Chem
from rdkit.Chem import Descriptors

############### Molecular descriptor calculation ###############
# Reads `input_file` (header row skipped; column 0 = SMILES, column 1 = label),
# evaluates every descriptor registered in RDKit's Descriptors._descList for
# each molecule, and writes the resulting table (descriptor columns + 'label'
# + 'smiles') to 'output_descriptor_<input file name>'.

# Column names are taken from RDKit's descriptor registry once, up front.
# Deriving them from the last processed row (as before) produced an empty or
# partial column list whenever that row failed, and the expected descriptor
# count now follows the installed RDKit version instead of a hard-coded 208.
descriptor_names = [desc_name for desc_name, _ in Descriptors._descList]
n_descriptors = len(descriptor_names)

X_list = []
y_list = []
smiles_list = []
n_skipped = 0
with open(input_file, 'r') as fp:
    fp.readline()  # skip the header row
    for line in fp:
        sptlist = line.strip().split(',')
        if len(sptlist) < 2:
            # Blank or malformed line (no label column): nothing to featurize.
            continue
        smiles = sptlist[0].strip()
        label = sptlist[1].strip()

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit could not parse the SMILES; the compound is left out.
            n_skipped += 1
            continue

        # A compound is kept only if *every* descriptor can be computed.
        all_descriptors = {}
        try:
            for desc_name, desc_func in Descriptors._descList:
                all_descriptors[desc_name] = desc_func(mol)
        except Exception:
            n_skipped += 1
            continue

        descriptor_feature = np.asarray(list(all_descriptors.values()))

        smiles_list.append(smiles)
        X_list.append(descriptor_feature)
        y_list.append(label)

print('descriptor featurization: kept %d compounds, skipped %d' % (len(smiles_list), n_skipped))

# reshape keeps the array 2-D even when no compound survived, so the
# DataFrame construction below still matches the column list.
X_list = np.asarray(X_list).reshape(-1, n_descriptors)
y_list = np.asarray(y_list)
smiles_list = np.asarray(smiles_list)

df_tmp = pd.DataFrame(X_list, columns=descriptor_names)
df_tmp['label'] = y_list
df_tmp['smiles'] = smiles_list

output_file_descriptor = 'output_descriptor_%s'%(os.path.basename(input_file))
df_tmp.to_csv(output_file_descriptor, index=False)


############### Molecular fingerprint calculation ###############
# Reads `input_file` (header row skipped; column 0 = SMILES, column 1 = label),
# computes a circular fingerprint for each molecule with DeepChem, and writes
# the resulting table (columns x1..xN + 'label' + 'smiles') to
# 'output_fingerprint_<input file name>'.

fingerprint_size = 1024  # single source of truth for the fingerprint length
featurizer = dc.feat.CircularFingerprint(size=fingerprint_size, radius=2)

X_list = []
y_list = []
smiles_list = []
cols_names = ['x%s'%(i+1) for i in range(fingerprint_size)]
n_skipped = 0

with open(input_file, 'r') as fp:
    fp.readline()  # skip the header row
    for line in fp:
        sptlist = line.strip().split(',')
        if len(sptlist) < 2:
            # Blank or malformed line (no label column): nothing to featurize.
            continue
        smiles = sptlist[0].strip()
        label = sptlist[1].strip()

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparseable SMILES: skip before calling the featurizer, which
            # would otherwise only log an error and return an empty feature.
            n_skipped += 1
            continue

        fingerprint_feature = featurizer.featurize(smiles)
        # Compounds whose fingerprint could not be computed are left out.
        if fingerprint_feature.ndim != 2 or fingerprint_feature.shape[1] != fingerprint_size:
            n_skipped += 1
            continue

        smiles_list.append(smiles)
        X_list.append(fingerprint_feature[0])
        y_list.append(label)

# reshape keeps the array 2-D even when no compound survived, so the
# DataFrame construction below still matches the column list.
X_list = np.asarray(X_list).reshape(-1, fingerprint_size)
y_list = np.asarray(y_list)
smiles_list = np.asarray(smiles_list)

# Report a short summary instead of dumping every fingerprint vector.
print('fingerprint featurization: kept %d compounds, skipped %d, matrix shape %s'
      % (len(smiles_list), n_skipped, X_list.shape))

df_tmp = pd.DataFrame(X_list, columns=cols_names)
df_tmp['label'] = y_list
df_tmp['smiles'] = smiles_list

output_file_fingerprint = 'output_fingerprint_%s'%(os.path.basename(input_file))
df_tmp.to_csv(output_file_fingerprint, index=False)


[08:56:14] non-ring atom 1 marked aromatic
[08:56:16] SMILES Parse Error: syntax error while parsing: [H][N
[08:56:16] SMILES Parse Error: Failed parsing SMILES '[H][N' for input: '[H][N'
[08:56:23] non-ring atom 1 marked aromatic
[08:56:23] non-ring atom 1 marked aromatic
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
[08:56:23] SMILES Parse Error: syntax error while parsing: [H][N
[08:56:23] SMILES Parse Error: Failed parsing SMILES '[H][N' for input: '[H][N'
[08:56:23] SMILES Parse Error: syntax error while parsing: [H][N
[08:56:23] SMILES Parse Error: Failed parsing SMILES '[H][N' for input: '[H][N'
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)


[array([0., 1., 0., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 1., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 1., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., ..., 0., 0., 0.]), array([0., 1., 1., ..., 0., 0., 0.]), array([0., 