In [None]:
!pip install numpy<2
!pip install rdkit-pypi -q

import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem

import warnings
# Install a blanket "ignore" filter so Python warnings raised after this point
# are not shown in the notebook output.
warnings.filterwarnings(action='ignore')

/bin/bash: line 1: 2: No such file or directory


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files under MyDrive can be read and written.
drive.mount('/content/drive')
# Location of the training CSV on the mounted drive.
file_path = '/content/drive/MyDrive/dacon_1/train.csv'

# Load the training table; later cells read its 'Canonical_Smiles', 'ID' and
# 'Inhibition' columns.
train_df = pd.read_csv(file_path)

Mounted at /content/drive


1st — Morgan fingerprint (radius 2, 1024 bits) + six RDKit descriptors

In [None]:
def get_features_from_smiles(smiles_string):
    """Featurise one SMILES string as Morgan fingerprint bits plus six descriptors.

    Parameters
    ----------
    smiles_string : str
        SMILES representation of a molecule.

    Returns
    -------
    numpy.ndarray or None
        1-D array of length 1030: the 1024 Morgan fingerprint bits (radius 2)
        followed by MolWt, MolLogP, NumHDonors, NumHAcceptors,
        NumRotatableBonds and TPSA, in that order. ``None`` when the input is
        not a non-blank string or RDKit cannot parse it, so callers can drop
        such rows with ``dropna()``.
    """
    # Reject non-string values (e.g. NaN from a missing CSV cell) and blank
    # strings up front: the former would raise inside RDKit, and an empty
    # SMILES parses to an atom-less molecule that would silently become an
    # all-zero feature row.
    if not isinstance(smiles_string, str) or not smiles_string.strip():
        return None

    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        return None

    # Morgan fingerprint (1024-bit, radius=2)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    fp_array = np.array(fp)

    # Order must match the descriptor column names used when building the
    # feature DataFrame.
    descriptors = [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.TPSA(mol)
    ]

    return np.concatenate([fp_array, descriptors])

# Register tqdm's progress_apply on pandas objects, then featurise every SMILES.
tqdm.pandas()
feature_matrix = train_df['Canonical_Smiles'].progress_apply(get_features_from_smiles)
print("success")

# Rows for which the featuriser returned None are dropped; the surviving
# index labels select the matching rows of the training table.
parsed_features = feature_matrix.dropna()
valid_indices = parsed_features.index
feature_list_clean = parsed_features.tolist()

# Column layout: 1024 fingerprint bits followed by the six descriptors.
desc_columns = ['MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors', 'NumRotatableBonds', 'TPSA']
fp_columns = [f'fp_{bit}' for bit in range(1024)]

df_clean = train_df.loc[valid_indices].reset_index(drop=True)
feature_df = pd.DataFrame(feature_list_clean, columns=fp_columns + desc_columns)

# Put the identifier first and the target last.
feature_df.insert(0, 'ID', df_clean['ID'])
feature_df['Inhibition'] = df_clean['Inhibition']

output_file_path = '/content/drive/MyDrive/input_candidates_01_0.csv'
feature_df.to_csv(output_file_path, index=False)

  0%|          | 0/1681 [00:00<?, ?it/s]

success


2nd — ECFP-like Morgan fingerprint (useFeatures=False) + six RDKit descriptors

In [None]:
def get_features_from_smiles_ecfp(smiles_string):
    """Featurise one SMILES string as ECFP-like fingerprint bits plus six descriptors.

    Parameters
    ----------
    smiles_string : str
        SMILES representation of a molecule.

    Returns
    -------
    numpy.ndarray or None
        1-D array of length 1030: the 1024 Morgan fingerprint bits (radius 2,
        ``useFeatures=False``) followed by MolWt, MolLogP, NumHDonors,
        NumHAcceptors, NumRotatableBonds and TPSA, in that order. ``None``
        when the input is not a non-blank string or RDKit cannot parse it,
        so callers can drop such rows with ``dropna()``.
    """
    # Reject non-string values (e.g. NaN from a missing CSV cell) and blank
    # strings up front: the former would raise inside RDKit, and an empty
    # SMILES parses to an atom-less molecule that would silently become an
    # all-zero feature row.
    if not isinstance(smiles_string, str) or not smiles_string.strip():
        return None

    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        return None

    # ECFP (1024-bit, radius=2, useFeatures=False)
    # Similar to ECFP4. NOTE(review): useFeatures=False appears to be RDKit's
    # default, so this should match a plain Morgan call with the same radius
    # and nBits — confirm against the RDKit docs.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024, useFeatures=False)
    fp_array = np.array(fp)

    # Order must match the descriptor column names used when building the
    # feature DataFrame.
    descriptors = [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.TPSA(mol)
    ]

    return np.concatenate([fp_array, descriptors])

# Register tqdm's progress_apply on pandas objects, then featurise every SMILES.
tqdm.pandas()
feature_matrix = train_df['Canonical_Smiles'].progress_apply(get_features_from_smiles_ecfp)
print("ECFP-like feature extraction success")

# Rows for which the featuriser returned None are dropped; the surviving
# index labels select the matching rows of the training table.
parsed_features = feature_matrix.dropna()
valid_indices = parsed_features.index
feature_list_clean = parsed_features.tolist()

# Column layout: 1024 fingerprint bits followed by the six descriptors.
desc_columns = ['MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors', 'NumRotatableBonds', 'TPSA']
fp_columns = [f'ecfp_fp_{bit}' for bit in range(1024)]

df_clean = train_df.loc[valid_indices].reset_index(drop=True)
feature_df = pd.DataFrame(feature_list_clean, columns=fp_columns + desc_columns)

# Put the identifier first and the target last.
feature_df.insert(0, 'ID', df_clean['ID'])
feature_df['Inhibition'] = df_clean['Inhibition']

output_file_path = '/content/drive/MyDrive/input_candidates_01_1.csv'
feature_df.to_csv(output_file_path, index=False)

  0%|          | 0/1681 [00:00<?, ?it/s]

ECFP-like feature extraction success


3rd — FCFP-like Morgan fingerprint (useFeatures=True) + six RDKit descriptors

In [None]:
def get_features_from_smiles_fcfp(smiles_string):
    """Featurise one SMILES string as FCFP-like fingerprint bits plus six descriptors.

    Parameters
    ----------
    smiles_string : str
        SMILES representation of a molecule.

    Returns
    -------
    numpy.ndarray or None
        1-D array of length 1030: the 1024 Morgan fingerprint bits (radius 2,
        ``useFeatures=True``) followed by MolWt, MolLogP, NumHDonors,
        NumHAcceptors, NumRotatableBonds and TPSA, in that order. ``None``
        when the input is not a non-blank string or RDKit cannot parse it,
        so callers can drop such rows with ``dropna()``.
    """
    # Reject non-string values (e.g. NaN from a missing CSV cell) and blank
    # strings up front: the former would raise inside RDKit, and an empty
    # SMILES parses to an atom-less molecule that would silently become an
    # all-zero feature row.
    if not isinstance(smiles_string, str) or not smiles_string.strip():
        return None

    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        return None

    # FCFP-like fingerprint (1024-bit, radius=2, useFeatures=True)
    # Similar to FCFP4
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024, useFeatures=True)
    fp_array = np.array(fp)

    # Order must match the descriptor column names used when building the
    # feature DataFrame.
    descriptors = [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.TPSA(mol)
    ]

    return np.concatenate([fp_array, descriptors])

# Register tqdm's progress_apply on pandas objects, then featurise every SMILES.
tqdm.pandas()
feature_matrix = train_df['Canonical_Smiles'].progress_apply(get_features_from_smiles_fcfp)
print("FCFP-like feature extraction success")

# Rows for which the featuriser returned None are dropped; the surviving
# index labels select the matching rows of the training table.
parsed_features = feature_matrix.dropna()
valid_indices = parsed_features.index
feature_list_clean = parsed_features.tolist()

# Column layout: 1024 fingerprint bits followed by the six descriptors.
desc_columns = ['MolWt', 'LogP', 'NumHDonors', 'NumHAcceptors', 'NumRotatableBonds', 'TPSA']
fp_columns = [f'fcfp_fp_{bit}' for bit in range(1024)]

df_clean = train_df.loc[valid_indices].reset_index(drop=True)
feature_df = pd.DataFrame(feature_list_clean, columns=fp_columns + desc_columns)

# Put the identifier first and the target last.
feature_df.insert(0, 'ID', df_clean['ID'])
feature_df['Inhibition'] = df_clean['Inhibition']

output_file_path = '/content/drive/MyDrive/input_candidates_01_2.csv'
feature_df.to_csv(output_file_path, index=False)

  0%|          | 0/1681 [00:00<?, ?it/s]

FCFP-like feature extraction success
